This revision was automatically updated to reflect the committed changes. Closed by commit rGc77a4078e010: [CUDA][HIP] Fix host/device based overload resolution (authored by yaxunl). Herald added a project: clang.
Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D77954/new/ https://reviews.llvm.org/D77954 Files: clang/lib/Sema/SemaOverload.cpp clang/test/SemaCUDA/function-overload.cu
Index: clang/test/SemaCUDA/function-overload.cu =================================================================== --- clang/test/SemaCUDA/function-overload.cu +++ clang/test/SemaCUDA/function-overload.cu @@ -331,9 +331,6 @@ // If we have a mix of HD and H-only or D-only candidates in the overload set, // normal C++ overload resolution rules apply first. template <typename T> TemplateReturnTy template_vs_hd_function(T arg) -#ifdef __CUDA_ARCH__ -//expected-note@-2 {{declared here}} -#endif { return TemplateReturnTy(); } @@ -342,11 +339,13 @@ } __host__ __device__ void test_host_device_calls_hd_template() { - HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f); - TemplateReturnTy ret2 = template_vs_hd_function(1); #ifdef __CUDA_ARCH__ - // expected-error@-2 {{reference to __host__ function 'template_vs_hd_function<int>' in __host__ __device__ function}} + typedef HostDeviceReturnTy ExpectedReturnTy; +#else + typedef TemplateReturnTy ExpectedReturnTy; #endif + HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f); + ExpectedReturnTy ret2 = template_vs_hd_function(1); } __host__ void test_host_calls_hd_template() { @@ -367,14 +366,14 @@ __device__ DeviceReturnTy device_only_function(int arg) { return DeviceReturnTy(); } __device__ DeviceReturnTy2 device_only_function(float arg) { return DeviceReturnTy2(); } #ifndef __CUDA_ARCH__ - // expected-note@-3 {{'device_only_function' declared here}} - // expected-note@-3 {{'device_only_function' declared here}} + // expected-note@-3 2{{'device_only_function' declared here}} + // expected-note@-3 2{{'device_only_function' declared here}} #endif __host__ HostReturnTy host_only_function(int arg) { return HostReturnTy(); } __host__ HostReturnTy2 host_only_function(float arg) { return HostReturnTy2(); } #ifdef __CUDA_ARCH__ - // expected-note@-3 {{'host_only_function' declared here}} - // expected-note@-3 {{'host_only_function' declared here}} + // expected-note@-3 2{{'host_only_function' declared here}} + // expected-note@-3 2{{'host_only_function' declared here}} #endif __host__ __device__ void test_host_device_single_side_overloading() { @@ -392,6 +391,37 @@ #endif } +// wrong-sided overloading should not cause diagnostic unless it is emitted. +// This inline function is not emitted. +inline __host__ __device__ void test_host_device_wrong_side_overloading_inline_no_diag() { + DeviceReturnTy ret1 = device_only_function(1); + DeviceReturnTy2 ret2 = device_only_function(1.0f); + HostReturnTy ret3 = host_only_function(1); + HostReturnTy2 ret4 = host_only_function(1.0f); +} + +// wrong-sided overloading should cause diagnostic if it is emitted. +// This inline function is emitted since it is called by an emitted function. +inline __host__ __device__ void test_host_device_wrong_side_overloading_inline_diag() { + DeviceReturnTy ret1 = device_only_function(1); + DeviceReturnTy2 ret2 = device_only_function(1.0f); +#ifndef __CUDA_ARCH__ + // expected-error@-3 {{reference to __device__ function 'device_only_function' in __host__ __device__ function}} + // expected-error@-3 {{reference to __device__ function 'device_only_function' in __host__ __device__ function}} +#endif + HostReturnTy ret3 = host_only_function(1); + HostReturnTy2 ret4 = host_only_function(1.0f); +#ifdef __CUDA_ARCH__ + // expected-error@-3 {{reference to __host__ function 'host_only_function' in __host__ __device__ function}} + // expected-error@-3 {{reference to __host__ function 'host_only_function' in __host__ __device__ function}} +#endif +} + +__host__ __device__ void test_host_device_wrong_side_overloading_inline_diag_caller() { + test_host_device_wrong_side_overloading_inline_diag(); + // expected-note@-1 {{called by 'test_host_device_wrong_side_overloading_inline_diag_caller'}} +} + // Verify that we allow overloading function templates. template <typename T> __host__ T template_overload(const T &a) { return a; }; template <typename T> __device__ T template_overload(const T &a) { return a; }; Index: clang/lib/Sema/SemaOverload.cpp =================================================================== --- clang/lib/Sema/SemaOverload.cpp +++ clang/lib/Sema/SemaOverload.cpp @@ -9374,16 +9374,22 @@ return Comparison::Equal; } -static bool isBetterMultiversionCandidate(const OverloadCandidate &Cand1, - const OverloadCandidate &Cand2) { +static Comparison +isBetterMultiversionCandidate(const OverloadCandidate &Cand1, + const OverloadCandidate &Cand2) { if (!Cand1.Function || !Cand1.Function->isMultiVersion() || !Cand2.Function || !Cand2.Function->isMultiVersion()) - return false; + return Comparison::Equal; - // If Cand1 is invalid, it cannot be a better match, if Cand2 is invalid, this - // is obviously better. - if (Cand1.Function->isInvalidDecl()) return false; - if (Cand2.Function->isInvalidDecl()) return true; + // If both are invalid, they are equal. If one of them is invalid, the other + // is better. + if (Cand1.Function->isInvalidDecl()) { + if (Cand2.Function->isInvalidDecl()) + return Comparison::Equal; + return Comparison::Worse; + } + if (Cand2.Function->isInvalidDecl()) + return Comparison::Better; // If this is a cpu_dispatch/cpu_specific multiversion situation, prefer // cpu_dispatch, else arbitrarily based on the identifiers. @@ -9393,16 +9399,18 @@ const auto *Cand2CPUSpec = Cand2.Function->getAttr<CPUSpecificAttr>(); if (!Cand1CPUDisp && !Cand2CPUDisp && !Cand1CPUSpec && !Cand2CPUSpec) - return false; + return Comparison::Equal; if (Cand1CPUDisp && !Cand2CPUDisp) - return true; + return Comparison::Better; if (Cand2CPUDisp && !Cand1CPUDisp) - return false; + return Comparison::Worse; if (Cand1CPUSpec && Cand2CPUSpec) { if (Cand1CPUSpec->cpus_size() != Cand2CPUSpec->cpus_size()) - return Cand1CPUSpec->cpus_size() < Cand2CPUSpec->cpus_size(); + return Cand1CPUSpec->cpus_size() < Cand2CPUSpec->cpus_size() + ? Comparison::Better + : Comparison::Worse; std::pair<CPUSpecificAttr::cpus_iterator, CPUSpecificAttr::cpus_iterator> FirstDiff = std::mismatch( @@ -9415,7 +9423,9 @@ assert(FirstDiff.first != Cand1CPUSpec->cpus_end() && "Two different cpu-specific versions should not have the same " "identifier list, otherwise they'd be the same decl!"); - return (*FirstDiff.first)->getName() < (*FirstDiff.second)->getName(); + return (*FirstDiff.first)->getName() < (*FirstDiff.second)->getName() + ? Comparison::Better + : Comparison::Worse; } llvm_unreachable("No way to get here unless both had cpu_dispatch"); } @@ -9475,6 +9485,50 @@ else if (!Cand1.Viable) return false; + // [CUDA] A function with 'never' preference is marked not viable, therefore + // is never shown up here. The worst preference shown up here is 'wrong side', + // e.g. a host function called by a device host function in device + // compilation. This is valid AST as long as the host device function is not + // emitted, e.g. it is an inline function which is called only by a host + // function. A deferred diagnostic will be triggered if it is emitted. + // However a wrong-sided function is still a viable candidate here. + // + // If Cand1 can be emitted and Cand2 cannot be emitted in the current + // context, Cand1 is better than Cand2. If Cand1 can not be emitted and Cand2 + // can be emitted, Cand1 is not better than Cand2. This rule should have + // precedence over other rules. + // + // If both Cand1 and Cand2 can be emitted, or neither can be emitted, then + // other rules should be used to determine which is better. This is because + // host/device based overloading resolution is mostly for determining + // viability of a function. If two functions are both viable, other factors + // should take precedence in preference, e.g. the standard-defined preferences + // like argument conversion ranks or enable_if partial-ordering. The + // preference for pass-object-size parameters is probably most similar to a + // type-based-overloading decision and so should take priority. + // + // If other rules cannot determine which is better, CUDA preference will be + // used again to determine which is better. + // + // TODO: Currently IdentifyCUDAPreference does not return correct values + // for functions called in global variable initializers due to missing + // correct context about device/host. Therefore we can only enforce this + // rule when there is a caller. We should enforce this rule for functions + // in global variable initializers once proper context is added. + if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) { + if (FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext)) { + auto P1 = S.IdentifyCUDAPreference(Caller, Cand1.Function); + auto P2 = S.IdentifyCUDAPreference(Caller, Cand2.Function); + assert(P1 != Sema::CFP_Never && P2 != Sema::CFP_Never); + auto Cand1Emittable = P1 > Sema::CFP_WrongSide; + auto Cand2Emittable = P2 > Sema::CFP_WrongSide; + if (Cand1Emittable && !Cand2Emittable) + return true; + if (!Cand1Emittable && Cand2Emittable) + return false; + } + } + // C++ [over.match.best]p1: // // -- if F is a static member function, ICS1(F) is defined such @@ -9709,12 +9763,6 @@ return Cmp == Comparison::Better; } - if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) { - FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext); - return S.IdentifyCUDAPreference(Caller, Cand1.Function) > - S.IdentifyCUDAPreference(Caller, Cand2.Function); - } - bool HasPS1 = Cand1.Function != nullptr && functionHasPassObjectSizeParams(Cand1.Function); bool HasPS2 = Cand2.Function != nullptr && @@ -9722,7 +9770,22 @@ if (HasPS1 != HasPS2 && HasPS1) return true; - return isBetterMultiversionCandidate(Cand1, Cand2); + auto MV = isBetterMultiversionCandidate(Cand1, Cand2); + if (MV == Comparison::Better) + return true; + if (MV == Comparison::Worse) + return false; + + // If other rules cannot determine which is better, CUDA preference is used + // to determine which is better. + if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) { + if (FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext)) { + return S.IdentifyCUDAPreference(Caller, Cand1.Function) > + S.IdentifyCUDAPreference(Caller, Cand2.Function); + } + } + + return false; } /// Determine whether two declarations are "equivalent" for the purposes of @@ -9808,33 +9871,6 @@ std::transform(begin(), end(), std::back_inserter(Candidates), [](OverloadCandidate &Cand) { return &Cand; }); - // [CUDA] HD->H or HD->D calls are technically not allowed by CUDA but - // are accepted by both clang and NVCC. However, during a particular - // compilation mode only one call variant is viable. We need to - // exclude non-viable overload candidates from consideration based - // only on their host/device attributes. Specifically, if one - // candidate call is WrongSide and the other is SameSide, we ignore - // the WrongSide candidate. - if (S.getLangOpts().CUDA) { - const FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext); - bool ContainsSameSideCandidate = - llvm::any_of(Candidates, [&](OverloadCandidate *Cand) { - // Check viable function only. - return Cand->Viable && Cand->Function && - S.IdentifyCUDAPreference(Caller, Cand->Function) == - Sema::CFP_SameSide; - }); - if (ContainsSameSideCandidate) { - auto IsWrongSideCandidate = [&](OverloadCandidate *Cand) { - // Check viable function only to avoid unnecessary data copying/moving. - return Cand->Viable && Cand->Function && - S.IdentifyCUDAPreference(Caller, Cand->Function) == - Sema::CFP_WrongSide; - }; - llvm::erase_if(Candidates, IsWrongSideCandidate); - } - } - // Find the best viable function. Best = end(); for (auto *Cand : Candidates) {
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits