[clang] [clang][Sema][CUDA] Restrict immediate template resolution to host-device functions (PR #200662)

Weibo He via cfe-commits Sun, 07 Jun 2026 20:25:23 -0700

https://github.com/NewSigma updated 
https://github.com/llvm/llvm-project/pull/200662


>From 65e8bdeb7207df30e0df1f182e4be2067004b516 Mon Sep 17 00:00:00 2001
From: NewSigma <[email protected]>
Date: Mon, 8 Jun 2026 11:00:49 +0800
Subject: [PATCH] [clang][Sema][CUDA] Restrict immediate template resolution to
 host-device functions

---
 clang/docs/ReleaseNotes.rst         |  1 +
 clang/include/clang/Sema/Overload.h | 18 +------
 clang/lib/Sema/SemaOverload.cpp     | 30 ++++++++++--
 clang/test/SemaCUDA/pr200545.cu     | 73 +++++++++++++++++++++++++++++
 4 files changed, 101 insertions(+), 21 deletions(-)
 create mode 100644 clang/test/SemaCUDA/pr200545.cu

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index f97e90634396a..4ac3f6db2c732 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -665,6 +665,7 @@ Bug Fixes in This Version
   an array via an element-at-a-time copy loop (#GH192026)
 - Fixed an issue where certain designated initializers would be rejected for 
constexpr variables. (#GH193373)
 - Fixed a crash when ``#embed`` is used with C++ modules (#GH195350)
+- Fixed a bug where ``-x cuda`` caused clang to immediately deduce template 
candidates whose deduction should have been deferred. (#GH200545)
 - Fixed an issue where ``__typeof_unqual`` and ``__typeof_unqual__`` were 
rejected as a declaration specifier in block scope in C++.
 - Fixed crash when checking for overflow for unary operator that can't 
overflow (#GH170072)
 
diff --git a/clang/include/clang/Sema/Overload.h 
b/clang/include/clang/Sema/Overload.h
index d42963e325b58..1e412ff6fc9e2 100644
--- a/clang/include/clang/Sema/Overload.h
+++ b/clang/include/clang/Sema/Overload.h
@@ -1353,7 +1353,7 @@ class Sema;
     bool shouldDeferDiags(Sema &S, ArrayRef<Expr *> Args, SourceLocation 
OpLoc);
 
     // Whether the resolution of template candidates should be deferred
-    bool shouldDeferTemplateArgumentDeduction(const LangOptions &Opts) const;
+    bool shouldDeferTemplateArgumentDeduction(const Sema &S) const;
 
     /// Determine when this overload candidate will be new to the
     /// overload set.
@@ -1545,22 +1545,6 @@ class Sema;
   // good candidate as we can get, despite the fact that it takes one less
   // parameter.
   bool shouldEnforceArgLimit(bool PartialOverloading, FunctionDecl *Function);
-
-  inline bool OverloadCandidateSet::shouldDeferTemplateArgumentDeduction(
-      const LangOptions &Opts) const {
-    return
-        // For user defined conversion we need to check against different
-        // combination of CV qualifiers and look at any explicit specifier, so
-        // always deduce template candidates.
-        Kind != CSK_InitByUserDefinedConversion
-        // When doing code completion, we want to see all the
-        // viable candidates.
-        && Kind != CSK_CodeCompletion
-        // CUDA may prefer template candidates even when a non-candidate
-        // is a perfect match
-        && !Opts.CUDA;
-  }
-
 } // namespace clang
 
 #endif // LLVM_CLANG_SEMA_OVERLOAD_H
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index d53fd726e9f0b..0ad938568bce2 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -8192,7 +8192,7 @@ void Sema::AddMethodTemplateCandidate(
     return;
 
   if (ExplicitTemplateArgs ||
-      !CandidateSet.shouldDeferTemplateArgumentDeduction(getLangOpts())) {
+      !CandidateSet.shouldDeferTemplateArgumentDeduction(*this)) {
     AddMethodTemplateCandidateImmediately(
         *this, CandidateSet, MethodTmpl, FoundDecl, ActingContext,
         ExplicitTemplateArgs, ObjectType, ObjectClassification, Args,
@@ -8322,7 +8322,7 @@ void Sema::AddTemplateOverloadCandidate(
   bool DependentExplicitSpecifier = hasDependentExplicit(FunctionTemplate);
 
   if (ExplicitTemplateArgs ||
-      !CandidateSet.shouldDeferTemplateArgumentDeduction(getLangOpts()) ||
+      !CandidateSet.shouldDeferTemplateArgumentDeduction(*this) ||
       (isa<CXXConstructorDecl>(FunctionTemplate->getTemplatedDecl()) &&
        DependentExplicitSpecifier)) {
 
@@ -8760,7 +8760,7 @@ void Sema::AddTemplateConversionCandidate(
   if (!CandidateSet.isNewCandidate(FunctionTemplate))
     return;
 
-  if (!CandidateSet.shouldDeferTemplateArgumentDeduction(getLangOpts()) ||
+  if (!CandidateSet.shouldDeferTemplateArgumentDeduction(*this) ||
       CandidateSet.getKind() ==
           OverloadCandidateSet::CSK_InitByUserDefinedConversion ||
       CandidateSet.getKind() == OverloadCandidateSet::CSK_InitByConstructor) {
@@ -11581,7 +11581,7 @@ OverloadingResult 
OverloadCandidateSet::BestViableFunction(Sema &S,
                                                            SourceLocation Loc,
                                                            iterator &Best) {
 
-  assert((shouldDeferTemplateArgumentDeduction(S.getLangOpts()) ||
+  assert((shouldDeferTemplateArgumentDeduction(S) ||
           DeferredCandidatesCount == 0) &&
          "Unexpected deferred template candidates");
 
@@ -13533,6 +13533,28 @@ void OverloadCandidateSet::NoteCandidates(Sema &S, 
ArrayRef<Expr *> Args,
   }
 }
 
+bool OverloadCandidateSet::shouldDeferTemplateArgumentDeduction(
+    const Sema &S) const {
+  if (S.getLangOpts().CUDA) {
+    auto *Caller = S.getCurFunctionDecl(true);
+    // Overloading based on __host__ and __device__ attributes takes higher
+    // priority, so in an HD caller a template candidate may be favored even
+    // when a non-template candidate is a perfect match; do not defer here.
+    if (Caller && Caller->hasAttr<CUDAHostAttr>() &&
+        Caller->hasAttr<CUDADeviceAttr>())
+      return false;
+  }
+
+  return
+      // For user-defined conversions we need to check against different
+      // combinations of CV qualifiers and look at any explicit specifier, so
+      // always deduce template candidates.
+      Kind != CSK_InitByUserDefinedConversion
+      // When doing code completion, we want to see all the
+      // viable candidates.
+      && Kind != CSK_CodeCompletion;
+}
+
 static SourceLocation
 GetLocationForCandidate(const TemplateSpecCandidate *Cand) {
   return Cand->Specialization ? Cand->Specialization->getLocation()
diff --git a/clang/test/SemaCUDA/pr200545.cu b/clang/test/SemaCUDA/pr200545.cu
new file mode 100644
index 0000000000000..97401d1fefccf
--- /dev/null
+++ b/clang/test/SemaCUDA/pr200545.cu
@@ -0,0 +1,73 @@
+// Test that template argument deduction is deferred correctly.
+//
+// RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify 
-verify-ignore-unexpected=note %s
+
+#include "Inputs/cuda.h"
+
+namespace h_free_call {
+  template<class T>
+  concept DoNotDeduct = []() {
+    static_assert(sizeof(T) == 0);
+    return true;
+  }();
+
+  void fn(int) {}
+  void fn(DoNotDeduct auto) {}
+
+  void call() {
+    fn(0);
+    fn(nullptr); // expected-error@-9 {{static assertion failed due to 
requirement 'sizeof(std::nullptr_t) == 0'}}
+  }
+}
+
+namespace h_member_call {
+  template<class T>
+  concept DoNotDeduct = []() {
+    static_assert(sizeof(T) == 0);
+    return true;
+  }();
+
+  struct A {
+    void operator=(int) {}
+    void operator=(DoNotDeduct auto) {}
+  };
+
+  void call(A a) {
+    a.operator=(0);
+    a.operator=(nullptr); // expected-error@-11 {{static assertion failed due 
to requirement 'sizeof(std::nullptr_t) == 0'}}
+  }
+}
+
+namespace hd_free_call {
+  template<class T>
+  concept DoNotDeduct = []() {
+    static_assert(sizeof(T) == 0);
+    return true;
+  }();
+
+  __host__ __device__ void fn(int) {}
+  __host__ __device__ void fn(DoNotDeduct auto) {}
+
+  __host__ __device__ void call() {
+    fn(0); // expected-error@-8 {{static assertion failed due to requirement 
'sizeof(int) == 0'}}
+    fn(nullptr); // expected-error@-9 {{static assertion failed due to 
requirement 'sizeof(std::nullptr_t) == 0'}}
+  }
+}
+
+namespace hd_member_call {
+  template<class T>
+  concept DoNotDeduct = []() {
+    static_assert(sizeof(T) == 0);
+    return true;
+  }();
+
+  struct A {
+    __host__ __device__ void operator=(int) {}
+    __host__ __device__ void operator=(DoNotDeduct auto) {}
+  };
+
+  __host__ __device__ void call(A a) {
+    a.operator=(0); // expected-error@-10 {{static assertion failed due to 
requirement 'sizeof(int) == 0'}}
+    a.operator=(nullptr); // expected-error@-11 {{static assertion failed due 
to requirement 'sizeof(std::nullptr_t) == 0'}}
+  }
+}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [clang][Sema][CUDA] Restrict immediate template resolution to host-device functions (PR #200662)

Reply via email to