Attaching an updated patch, incorporating comments from the PR. I still can't figure out how to merge and upload from hg/SourceTree; it may be some issue with my personal clone of the repository.
On Mon, Nov 4, 2019 at 11:11 PM Sam Hasinoff <[email protected]> wrote:

> Attaching a small patch:
>
> Add a new EIGEN_HAS_INTRINSIC_INT128 macro, and use this instead of
> __SIZEOF_INT128__. This fixes related issues with TensorIntDiv.h when
> building with Clang for Windows, where support for 128-bit integer
> arithmetic is advertised but broken in practice.
>
> I'm new to bitbucket and hg, and I couldn't figure out how to resolve the
> merge conflict in my related pull request manually:
>
> https://bitbucket.org/eigen/eigen/pull-requests/752/add-eigen_has_intrinsic_int128-macro/diff
>
> or how to push the attached patch using hg CLI tools.
>
> Thanks in advance,
> Sam
# HG changeset patch
# User Sam Hasinoff <[email protected]>
# Date 1573077159 28800
#      Wed Nov 06 13:52:39 2019 -0800
# Node ID 36e3e863e06d5fc805cace990b86acfd1959e3d4
# Parent  afc120bc03bdc4265858d6f86218eb1fed51b1b9
Add EIGEN_HAS_INTRINSIC_INT128 macro

Add a new EIGEN_HAS_INTRINSIC_INT128 macro, and use this instead of
__SIZEOF_INT128__. This fixes related issues with TensorIntDiv.h when
building with Clang for Windows, where support for 128-bit integer
arithmetic is advertised but broken in practice.

diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -752,16 +752,32 @@
   #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
   #endif
 #elif defined(__clang__) && defined(__CUDA__) && __has_feature(cxx_relaxed_constexpr)
   // clang++ always considers constexpr functions as implicitly __host__ __device__
   #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
 #endif
 #endif
 
+// Does the compiler support the __int128 and __uint128_t extensions for 128-bit
+// integer arithmetic?
+//
+// Clang and GCC define __SIZEOF_INT128__ when these extensions are supported,
+// but we avoid using them in certain cases:
+//
+// * Building using Clang for Windows, where the Clang runtime library has
+//   128-bit support only on LP64 architectures, but Windows is LLP64.
+#ifndef EIGEN_HAS_BUILTIN_INT128
+#if defined(__SIZEOF_INT128__) && !(EIGEN_OS_WIN && EIGEN_COMP_CLANG)
+#define EIGEN_HAS_BUILTIN_INT128 1
+#else
+#define EIGEN_HAS_BUILTIN_INT128 0
+#endif
+#endif
+
 //------------------------------------------------------------------------------------------
 // Preprocessor programming helpers
 //------------------------------------------------------------------------------------------
 
 // This macro can be used to prevent from macro expansion, e.g.:
 //   std::max EIGEN_NOT_A_MACRO(a,b)
 #define EIGEN_NOT_A_MACRO
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -100,17 +100,17 @@ namespace {
 }
 
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
 #if defined(EIGEN_GPU_COMPILE_PHASE)
   return __umul64hi(a, b);
 #elif defined(SYCL_DEVICE_ONLY)
   return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
-#elif defined(__SIZEOF_INT128__)
+#elif EIGEN_HAS_BUILTIN_INT128
   __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
   return static_cast<uint64_t>(v >> 64);
 #else
   return (TensorUInt128<static_val<0>, uint64_t>(a) * TensorUInt128<static_val<0>, uint64_t>(b)).upper();
 #endif
 }
 
 template <int N, typename T>
@@ -119,17 +119,17 @@ namespace {
     EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE);
     return static_cast<uint32_t>((static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1);
   }
 };
 
 template <typename T>
 struct DividerHelper<64, T> {
   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
-#if defined(__SIZEOF_INT128__) && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
+#if EIGEN_HAS_BUILTIN_INT128 && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
     return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
 #else
     const uint64_t shift = 1ULL << log_div;
     TensorUInt128<uint64_t, uint64_t> result = TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider)
                                              - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1);
     return static_cast<uint64_t>(result);
 #endif
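For anyone who wants to sanity-check the fast path outside of Eigen, here is a minimal standalone sketch of the same idea: a guard macro in the spirit of EIGEN_HAS_BUILTIN_INT128 (named HAS_BUILTIN_INT128 here, with raw _WIN32/__clang__ standing in for Eigen's EIGEN_OS_WIN/EIGEN_COMP_CLANG helpers), plus a mulhi_u64() that mirrors muluh(): the __uint128_t path on one side, and a portable 32-bit-halves fallback, in the spirit of TensorUInt128, on the other. All names in this sketch are hypothetical, not part of the patch.

#include <cassert>
#include <cstdint>

// Detection logic modeled on the patch's EIGEN_HAS_BUILTIN_INT128.
// _WIN32 / __clang__ approximate the EIGEN_OS_WIN / EIGEN_COMP_CLANG helpers.
#ifndef HAS_BUILTIN_INT128
#if defined(__SIZEOF_INT128__) && !(defined(_WIN32) && defined(__clang__))
#define HAS_BUILTIN_INT128 1
#else
#define HAS_BUILTIN_INT128 0
#endif
#endif

// High 64 bits of a 64x64 -> 128-bit unsigned multiply: the operation that
// TensorIntDiv.h's muluh() performs on its __uint128_t path.
static uint64_t mulhi_u64(uint64_t a, uint64_t b) {
#if HAS_BUILTIN_INT128
  __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
  return static_cast<uint64_t>(v >> 64);
#else
  // Portable fallback: split each operand into 32-bit halves and sum the
  // partial products with carries, which is what TensorUInt128 does in spirit.
  uint64_t a_lo = a & 0xFFFFFFFFu, a_hi = a >> 32;
  uint64_t b_lo = b & 0xFFFFFFFFu, b_hi = b >> 32;
  uint64_t lo_lo = a_lo * b_lo;
  uint64_t hi_lo = a_hi * b_lo;
  uint64_t lo_hi = a_lo * b_hi;
  uint64_t hi_hi = a_hi * b_hi;
  uint64_t cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFFu) + lo_hi;  // cannot overflow
  return hi_hi + (hi_lo >> 32) + (cross >> 32);
#endif
}

int main() {
  // 2^32 * 2^32 = 2^64, so the high word is exactly 1.
  assert(mulhi_u64(1ULL << 32, 1ULL << 32) == 1);
  // (2^64 - 1)^2 = 2^128 - 2^65 + 1, so the high word is 2^64 - 2.
  assert(mulhi_u64(~0ULL, ~0ULL) == ~0ULL - 1);
  return 0;
}

Building this with Clang targeting Windows versus Clang on Linux is also a quick way to confirm that the Windows toolchain now takes the fallback branch, given the email's premise that Clang for Windows advertises __SIZEOF_INT128__ anyway.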

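The computeMultiplier() change is also easy to cross-check numerically. The formula is the classic round-up magic-number multiplier, multiplier = 2^(64+log_div)/divider - 2^64 + 1, after which the quotient can be recovered as floor((mulhi(n, multiplier) + n) / 2^log_div). The sketch below assumes log_div = ceil(log2(divider)), which is my reading of what TensorIntDivisor's constructor passes in, and it reconstructs the quotient directly in 128-bit arithmetic for clarity rather than with the overflow-avoiding shift sequence Eigen's divide() uses; divisors are kept below 2^63 so the shift by 64+log_div stays in range.

#include <cassert>
#include <cstdint>
#include <random>

#if defined(__SIZEOF_INT128__)

// Same formula as DividerHelper<64, T>::computeMultiplier in the patch:
//   multiplier = 2^(64 + log_div) / divider - 2^64 + 1
static uint64_t compute_multiplier(int log_div, uint64_t divider) {
  return static_cast<uint64_t>(
      (static_cast<__uint128_t>(1) << (64 + log_div)) / divider -
      (static_cast<__uint128_t>(1) << 64) + 1);
}

// ceil(log2(d)): assumed here to match the log_div that TensorIntDivisor's
// constructor computes.
static int ceil_log2(uint64_t d) {
  int p = 0;
  while ((static_cast<__uint128_t>(1) << p) < d) ++p;
  return p;
}

int main() {
  std::mt19937_64 rng(42);
  for (int i = 0; i < 1000000; ++i) {
    uint64_t d = (rng() >> 1) | 1;  // divisor in [1, 2^63), so log_div <= 63
    uint64_t n = rng();
    int p = ceil_log2(d);
    uint64_t m = compute_multiplier(p, d);
    // Quotient reconstruction: floor((mulhi(n, m) + n) / 2^p), done in 128-bit
    // arithmetic here; mulhi(n, m) + n can exceed 64 bits, which is the
    // overflow Eigen's divide() sidesteps with its extra shifts.
    __uint128_t hi = (static_cast<__uint128_t>(n) * m) >> 64;
    uint64_t q = static_cast<uint64_t>((hi + n) >> p);
    assert(q == n / d);
  }
  return 0;
}

#else
int main() { return 0; }  // no __uint128_t available; nothing to cross-check
#endif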