Hi Justin, This is neat!
I see a bunch of uses of std::isinf, etc. here. It tends to be important that, when using -ffast-math (or -ffinite-math-only) these checks get optimized away. Can you please check that they do? If not, you might mirror what I've done in r283051 for libc++, which is similar to what libstdc++ ends up doing, so that we use __builtin_isnan/isinf/isfinite. Thanks again, Hal ----- Original Message ----- > From: "Justin Lebar via cfe-commits" <cfe-commits@lists.llvm.org> > To: cfe-commits@lists.llvm.org > Sent: Saturday, October 8, 2016 5:16:13 PM > Subject: r283680 - [CUDA] Support <complex> and std::min/max on the device. > > Author: jlebar > Date: Sat Oct 8 17:16:12 2016 > New Revision: 283680 > > URL: http://llvm.org/viewvc/llvm-project?rev=283680&view=rev > Log: > [CUDA] Support <complex> and std::min/max on the device. > > Summary: > We do this by wrapping <complex> and <algorithm>. > > Tests are in the test-suite. > > Reviewers: tra > > Subscribers: jhen, beanz, cfe-commits, mgorny > > Differential Revision: https://reviews.llvm.org/D24979 > > Added: > cfe/trunk/lib/Headers/__clang_cuda_complex_builtins.h > cfe/trunk/lib/Headers/cuda_wrappers/ > cfe/trunk/lib/Headers/cuda_wrappers/algorithm > cfe/trunk/lib/Headers/cuda_wrappers/complex > Modified: > cfe/trunk/lib/Driver/ToolChains.cpp > cfe/trunk/lib/Headers/CMakeLists.txt > cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h > > Modified: cfe/trunk/lib/Driver/ToolChains.cpp > URL: > http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains.cpp?rev=283680&r1=283679&r2=283680&view=diff > ============================================================================== > --- cfe/trunk/lib/Driver/ToolChains.cpp (original) > +++ cfe/trunk/lib/Driver/ToolChains.cpp Sat Oct 8 17:16:12 2016 > @@ -4694,6 +4694,15 @@ void Linux::AddClangCXXStdlibIncludeArgs > > void Linux::AddCudaIncludeArgs(const ArgList &DriverArgs, > ArgStringList &CC1Args) const { > + if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { > + // Add cuda_wrappers/* to our system include path. This lets us > wrap > + // standard library headers. > + SmallString<128> P(getDriver().ResourceDir); > + llvm::sys::path::append(P, "include"); > + llvm::sys::path::append(P, "cuda_wrappers"); > + addSystemInclude(DriverArgs, CC1Args, P); > + } > + > if (DriverArgs.hasArg(options::OPT_nocudainc)) > return; > > > Modified: cfe/trunk/lib/Headers/CMakeLists.txt > URL: > http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/CMakeLists.txt?rev=283680&r1=283679&r2=283680&view=diff > ============================================================================== > --- cfe/trunk/lib/Headers/CMakeLists.txt (original) > +++ cfe/trunk/lib/Headers/CMakeLists.txt Sat Oct 8 17:16:12 2016 > @@ -24,10 +24,13 @@ set(files > bmiintrin.h > __clang_cuda_builtin_vars.h > __clang_cuda_cmath.h > + __clang_cuda_complex_builtins.h > __clang_cuda_intrinsics.h > __clang_cuda_math_forward_declares.h > __clang_cuda_runtime_wrapper.h > cpuid.h > + cuda_wrappers/algorithm > + cuda_wrappers/complex > clflushoptintrin.h > emmintrin.h > f16cintrin.h > > Added: cfe/trunk/lib/Headers/__clang_cuda_complex_builtins.h > URL: > http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_complex_builtins.h?rev=283680&view=auto > ============================================================================== > --- cfe/trunk/lib/Headers/__clang_cuda_complex_builtins.h (added) > +++ cfe/trunk/lib/Headers/__clang_cuda_complex_builtins.h Sat Oct 8 > 17:16:12 2016 > @@ -0,0 +1,203 @@ > +/*===-- __clang_cuda_complex_builtins - CUDA impls of runtime > complex fns ---=== > + * > + * Permission is hereby granted, free of charge, to any person > obtaining a copy > + * of this software and associated documentation files (the > "Software"), to deal > + * in the Software without restriction, including without limitation > the rights > + * to use, copy, modify, merge, publish, distribute, sublicense, > and/or sell > + * copies of the Software, and to permit persons to whom the > Software is > + * furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be > included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT > SHALL THE > + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR > OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > ARISING FROM, > + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER > DEALINGS IN > + * THE SOFTWARE. > + * > + > *===-----------------------------------------------------------------------=== > + */ > + > +#ifndef __CLANG_CUDA_COMPLEX_BUILTINS > +#define __CLANG_CUDA_COMPLEX_BUILTINS > + > +// This header defines __muldc3, __mulsc3, __divdc3, and __divsc3. > These are > +// libgcc functions that clang assumes are available when compiling > c99 complex > +// operations. (These implementations come from libc++, and have > been modified > +// to work with CUDA.) > + > +extern "C" inline __device__ double _Complex __muldc3(double __a, > double __b, > + double __c, > double __d) { > + double __ac = __a * __c; > + double __bd = __b * __d; > + double __ad = __a * __d; > + double __bc = __b * __c; > + double _Complex z; > + __real__(z) = __ac - __bd; > + __imag__(z) = __ad + __bc; > + if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) { > + int __recalc = 0; > + if (std::isinf(__a) || std::isinf(__b)) { > + __a = std::copysign(std::isinf(__a) ? 1 : 0, __a); > + __b = std::copysign(std::isinf(__b) ? 1 : 0, __b); > + if (std::isnan(__c)) > + __c = std::copysign(0, __c); > + if (std::isnan(__d)) > + __d = std::copysign(0, __d); > + __recalc = 1; > + } > + if (std::isinf(__c) || std::isinf(__d)) { > + __c = std::copysign(std::isinf(__c) ? 1 : 0, __c); > + __d = std::copysign(std::isinf(__d) ? 1 : 0, __d); > + if (std::isnan(__a)) > + __a = std::copysign(0, __a); > + if (std::isnan(__b)) > + __b = std::copysign(0, __b); > + __recalc = 1; > + } > + if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) || > + std::isinf(__ad) || std::isinf(__bc))) { > + if (std::isnan(__a)) > + __a = std::copysign(0, __a); > + if (std::isnan(__b)) > + __b = std::copysign(0, __b); > + if (std::isnan(__c)) > + __c = std::copysign(0, __c); > + if (std::isnan(__d)) > + __d = std::copysign(0, __d); > + __recalc = 1; > + } > + if (__recalc) { > + // Can't use std::numeric_limits<double>::infinity() -- that > doesn't have > + // a device overload (and isn't constexpr before C++11, > naturally). > + __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d); > + __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c); > + } > + } > + return z; > +} > + > +extern "C" inline __device__ float _Complex __mulsc3(float __a, > float __b, > + float __c, > float __d) { > + float __ac = __a * __c; > + float __bd = __b * __d; > + float __ad = __a * __d; > + float __bc = __b * __c; > + float _Complex z; > + __real__(z) = __ac - __bd; > + __imag__(z) = __ad + __bc; > + if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) { > + int __recalc = 0; > + if (std::isinf(__a) || std::isinf(__b)) { > + __a = std::copysign(std::isinf(__a) ? 1 : 0, __a); > + __b = std::copysign(std::isinf(__b) ? 1 : 0, __b); > + if (std::isnan(__c)) > + __c = std::copysign(0, __c); > + if (std::isnan(__d)) > + __d = std::copysign(0, __d); > + __recalc = 1; > + } > + if (std::isinf(__c) || std::isinf(__d)) { > + __c = std::copysign(std::isinf(__c) ? 1 : 0, __c); > + __d = std::copysign(std::isinf(__d) ? 1 : 0, __d); > + if (std::isnan(__a)) > + __a = std::copysign(0, __a); > + if (std::isnan(__b)) > + __b = std::copysign(0, __b); > + __recalc = 1; > + } > + if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) || > + std::isinf(__ad) || std::isinf(__bc))) { > + if (std::isnan(__a)) > + __a = std::copysign(0, __a); > + if (std::isnan(__b)) > + __b = std::copysign(0, __b); > + if (std::isnan(__c)) > + __c = std::copysign(0, __c); > + if (std::isnan(__d)) > + __d = std::copysign(0, __d); > + __recalc = 1; > + } > + if (__recalc) { > + __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d); > + __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c); > + } > + } > + return z; > +} > + > +extern "C" inline __device__ double _Complex __divdc3(double __a, > double __b, > + double __c, > double __d) { > + int __ilogbw = 0; > + // Can't use std::max, because that's defined in <algorithm>, and > we don't > + // want to pull that in for every compile. The CUDA headers > define > + // ::max(float, float) and ::max(double, double), which is > sufficient for us. > + double __logbw = std::logb(max(std::abs(__c), std::abs(__d))); > + if (std::isfinite(__logbw)) { > + __ilogbw = (int)__logbw; > + __c = std::scalbn(__c, -__ilogbw); > + __d = std::scalbn(__d, -__ilogbw); > + } > + double __denom = __c * __c + __d * __d; > + double _Complex z; > + __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, > -__ilogbw); > + __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, > -__ilogbw); > + if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) { > + if ((__denom == 0.0) && (!std::isnan(__a) || !std::isnan(__b))) > { > + __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a; > + __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b; > + } else if ((std::isinf(__a) || std::isinf(__b)) && > std::isfinite(__c) && > + std::isfinite(__d)) { > + __a = std::copysign(std::isinf(__a) ? 1.0 : 0.0, __a); > + __b = std::copysign(std::isinf(__b) ? 1.0 : 0.0, __b); > + __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d); > + __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d); > + } else if (std::isinf(__logbw) && __logbw > 0.0 && > std::isfinite(__a) && > + std::isfinite(__b)) { > + __c = std::copysign(std::isinf(__c) ? 1.0 : 0.0, __c); > + __d = std::copysign(std::isinf(__d) ? 1.0 : 0.0, __d); > + __real__(z) = 0.0 * (__a * __c + __b * __d); > + __imag__(z) = 0.0 * (__b * __c - __a * __d); > + } > + } > + return z; > +} > + > +extern "C" inline __device__ float _Complex __divsc3(float __a, > float __b, > + float __c, > float __d) { > + int __ilogbw = 0; > + float __logbw = std::logb(max(std::abs(__c), std::abs(__d))); > + if (std::isfinite(__logbw)) { > + __ilogbw = (int)__logbw; > + __c = std::scalbn(__c, -__ilogbw); > + __d = std::scalbn(__d, -__ilogbw); > + } > + float __denom = __c * __c + __d * __d; > + float _Complex z; > + __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, > -__ilogbw); > + __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, > -__ilogbw); > + if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) { > + if ((__denom == 0) && (!std::isnan(__a) || !std::isnan(__b))) { > + __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a; > + __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b; > + } else if ((std::isinf(__a) || std::isinf(__b)) && > std::isfinite(__c) && > + std::isfinite(__d)) { > + __a = std::copysign(std::isinf(__a) ? 1 : 0, __a); > + __b = std::copysign(std::isinf(__b) ? 1 : 0, __b); > + __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d); > + __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d); > + } else if (std::isinf(__logbw) && __logbw > 0 && > std::isfinite(__a) && > + std::isfinite(__b)) { > + __c = std::copysign(std::isinf(__c) ? 1 : 0, __c); > + __d = std::copysign(std::isinf(__d) ? 1 : 0, __d); > + __real__(z) = 0 * (__a * __c + __b * __d); > + __imag__(z) = 0 * (__b * __c - __a * __d); > + } > + } > + return z; > +} > + > +#endif // __CLANG_CUDA_COMPLEX_BUILTINS > > Modified: cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h > URL: > http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h?rev=283680&r1=283679&r2=283680&view=diff > ============================================================================== > --- cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h (original) > +++ cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h Sat Oct 8 > 17:16:12 2016 > @@ -312,6 +312,7 @@ __device__ inline __cuda_builtin_gridDim > > #include <__clang_cuda_cmath.h> > #include <__clang_cuda_intrinsics.h> > +#include <__clang_cuda_complex_builtins.h> > > // curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx > in host > // mode, giving them their "proper" types of dim3 and uint3. This > is > > Added: cfe/trunk/lib/Headers/cuda_wrappers/algorithm > URL: > http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/cuda_wrappers/algorithm?rev=283680&view=auto > ============================================================================== > --- cfe/trunk/lib/Headers/cuda_wrappers/algorithm (added) > +++ cfe/trunk/lib/Headers/cuda_wrappers/algorithm Sat Oct 8 17:16:12 > 2016 > @@ -0,0 +1,96 @@ > +/*===---- complex - CUDA wrapper for <algorithm> > ----------------------------=== > + * > + * Permission is hereby granted, free of charge, to any person > obtaining a copy > + * of this software and associated documentation files (the > "Software"), to deal > + * in the Software without restriction, including without limitation > the rights > + * to use, copy, modify, merge, publish, distribute, sublicense, > and/or sell > + * copies of the Software, and to permit persons to whom the > Software is > + * furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be > included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT > SHALL THE > + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR > OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > ARISING FROM, > + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER > DEALINGS IN > + * THE SOFTWARE. > + * > + > *===-----------------------------------------------------------------------=== > + */ > + > +#ifndef __CLANG_CUDA_WRAPPERS_ALGORITHM > +#define __CLANG_CUDA_WRAPPERS_ALGORITHM > + > +// This header defines __device__ overloads of std::min/max, but > only if we're > +// <= C++11. In C++14, these functions are constexpr, and so are > implicitly > +// __host__ __device__. > +// > +// We don't support the initializer_list overloads because > +// initializer_list::begin() and end() are not __host__ __device__ > functions. > +// > +// When compiling in C++14 mode, we could force std::min/max to have > different > +// implementations for host and device, by declaring the device > overloads > +// before the constexpr overloads appear. We choose not to do this > because > + > +// a) why write our own implementation when we can use one from the > standard > +// library? and > +// b) libstdc++ is evil and declares min/max inside a header that > is included > +// *before* we include <algorithm>. So we'd have to > unconditionally > +// declare our __device__ overloads of min/max, but that would > pollute > +// things for people who choose not to include <algorithm>. > + > +#include_next <algorithm> > + > +#if __cplusplus <= 201103L > + > +// We need to define these overloads in exactly the namespace our > standard > +// library uses (including the right inline namespace), otherwise > they won't be > +// picked up by other functions in the standard library (e.g. > functions in > +// <complex>). Thus the ugliness below. > +#ifdef _LIBCPP_BEGIN_NAMESPACE_STD > +_LIBCPP_BEGIN_NAMESPACE_STD > +#else > +namespace std { > +#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION > +_GLIBCXX_BEGIN_NAMESPACE_VERSION > +#endif > +#endif > + > +template <class __T, class __Cmp> > +inline __device__ const __T & > +max(const __T &__a, const __T &__b, __Cmp __cmp) { > + return __cmp(__a, __b) ? __b : __a; > +} > + > +template <class __T> > +inline __device__ const __T & > +max(const __T &__a, const __T &__b) { > + return __a < __b ? __b : __a; > +} > + > +template <class __T, class __Cmp> > +inline __device__ const __T & > +min(const __T &__a, const __T &__b, __Cmp __cmp) { > + return __cmp(__b, __a) ? __b : __a; > +} > + > +template <class __T> > +inline __device__ const __T & > +min(const __T &__a, const __T &__b) { > + return __a < __b ? __b : __a; > +} > + > +#ifdef _LIBCPP_END_NAMESPACE_STD > +_LIBCPP_END_NAMESPACE_STD > +#else > +#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION > +_GLIBCXX_END_NAMESPACE_VERSION > +#endif > +} // namespace std > +#endif > + > +#endif // __cplusplus <= 201103L > +#endif // __CLANG_CUDA_WRAPPERS_ALGORITHM > > Added: cfe/trunk/lib/Headers/cuda_wrappers/complex > URL: > http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/cuda_wrappers/complex?rev=283680&view=auto > ============================================================================== > --- cfe/trunk/lib/Headers/cuda_wrappers/complex (added) > +++ cfe/trunk/lib/Headers/cuda_wrappers/complex Sat Oct 8 17:16:12 > 2016 > @@ -0,0 +1,79 @@ > +/*===---- complex - CUDA wrapper for <complex> > ------------------------------=== > + * > + * Permission is hereby granted, free of charge, to any person > obtaining a copy > + * of this software and associated documentation files (the > "Software"), to deal > + * in the Software without restriction, including without limitation > the rights > + * to use, copy, modify, merge, publish, distribute, sublicense, > and/or sell > + * copies of the Software, and to permit persons to whom the > Software is > + * furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be > included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT > SHALL THE > + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR > OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > ARISING FROM, > + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER > DEALINGS IN > + * THE SOFTWARE. > + * > + > *===-----------------------------------------------------------------------=== > + */ > + > +#pragma once > + > +// Wrapper around <complex> that forces its functions to be __host__ > +// __device__. > + > +// First, include host-only headers we think are likely to be > included by > +// <complex>, so that the pragma below only applies to <complex> > itself. > +#if __cplusplus >= 201103L > +#include <type_traits> > +#endif > +#include <stdexcept> > +#include <cmath> > +#include <sstream> > + > +// Next, include our <algorithm> wrapper, to ensure that device > overloads of > +// std::min/max are available. > +#include <algorithm> > + > +#pragma clang force_cuda_host_device begin > + > +// When compiling for device, ask libstdc++ to use its own > implements of > +// complex functions, rather than calling builtins (which resolve to > library > +// functions that don't exist when compiling CUDA device code). > +// > +// This is a little dicey, because it causes libstdc++ to define a > different > +// set of overloads on host and device. > +// > +// // Present only when compiling for host. > +// __host__ __device__ void complex<float> sin(const > complex<float>& x) { > +// return __builtin_csinf(x); > +// } > +// > +// // Present when compiling for host and for device. > +// template <typename T> > +// void __host__ __device__ complex<T> sin(const complex<T>& x) { > +// return complex<T>(sin(x.real()) * cosh(x.imag()), > +// cos(x.real()), sinh(x.imag())); > +// } > +// > +// This is safe because when compiling for device, all function > calls in > +// __host__ code to sin() will still resolve to *something*, even if > they don't > +// resolve to the same function as they resolve to when compiling > for host. We > +// don't care that they don't resolve to the right function because > we won't > +// codegen this host code when compiling for device. > + > +#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX") > +#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX_TR1") > +#define _GLIBCXX_USE_C99_COMPLEX 0 > +#define _GLIBCXX_USE_C99_COMPLEX_TR1 0 > + > +#include_next <complex> > + > +#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX_TR1") > +#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX") > + > +#pragma clang force_cuda_host_device end > > > _______________________________________________ > cfe-commits mailing list > cfe-commits@lists.llvm.org > http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits > -- Hal Finkel Lead, Compiler Technology and Programming Languages Leadership Computing Facility Argonne National Laboratory _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits