https://github.com/justinfargnoli updated 
https://github.com/llvm/llvm-project/pull/174834

>From 09e2a90a4d547fd0961bbb9fea88311610556032 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <[email protected]>
Date: Wed, 7 Jan 2026 02:51:02 +0000
Subject: [PATCH 1/4] [NVPTX] Validate user-specified PTX version against SM
 version

---
 llvm/lib/Target/NVPTX/NVPTX.td                | 83 ++++++++---------
 llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp      | 90 ++++++++++++++++++-
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h        |  5 +-
 llvm/test/CodeGen/NVPTX/convert-sm100a.ll     |  4 +-
 llvm/test/CodeGen/NVPTX/convert-sm103a.ll     |  4 +-
 llvm/test/CodeGen/NVPTX/f32-ex2.ll            |  4 +-
 llvm/test/CodeGen/NVPTX/fexp2.ll              |  4 +-
 llvm/test/CodeGen/NVPTX/flog2.ll              |  4 +-
 llvm/test/CodeGen/NVPTX/i128.ll               |  4 +-
 .../CodeGen/NVPTX/nvvm-reflect-arch-O0.ll     |  2 +-
 .../CodeGen/NVPTX/ptx-version-validation.ll   | 51 +++++++++++
 llvm/test/CodeGen/NVPTX/rsqrt.ll              |  4 +-
 llvm/test/CodeGen/NVPTX/sm-version.ll         |  2 +-
 llvm/test/CodeGen/NVPTX/surf-tex.py           |  4 +-
 llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py  | 14 ---
 15 files changed, 202 insertions(+), 77 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/ptx-version-validation.ll
 delete mode 100644 llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py

diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index d41a43de95098..bc2d67c9769dc 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -68,10 +68,11 @@ class FeaturePTX<int version>:
 // represents 'z'), sm_103f, and sm_103 architecture variants. The sm_103 is
 // compatible with sm_103a and sm_103f, and sm_103f is compatible with sm_103a.
 //
-// Encoding := Arch * 10 + 2 (for 'f') + 1 (for 'a')
+// Encoding := Arch * 10 + suffix_offset
 // Arch := X * 10 + Y
+// suffix_offset := 0 (base), 2 ('f'), or 3 ('a')
 //
-// For example, sm_103a is encoded as 1033 (103 * 10 + 2 + 1) and sm_103f is
+// For example, sm_103a is encoded as 1033 (103 * 10 + 3) and sm_103f is
 // encoded as 1032 (103 * 10 + 2).
 //
 // This encoding allows simple partial ordering of the architectures.
@@ -109,46 +110,46 @@ foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 
63, 64, 65, 70, 71, 72,
 class Proc<string Name, list<SubtargetFeature> Features>
  : Processor<Name, NoItineraries, Features>;
 
-def : Proc<"sm_20",   [SM20, PTX32]>;
-def : Proc<"sm_21",   [SM21, PTX32]>;
+def : Proc<"sm_20",   [SM20]>;
+def : Proc<"sm_21",   [SM21]>;
 def : Proc<"sm_30",   [SM30]>;
-def : Proc<"sm_32",   [SM32, PTX40]>;
-def : Proc<"sm_35",   [SM35, PTX32]>;
-def : Proc<"sm_37",   [SM37, PTX41]>;
-def : Proc<"sm_50",   [SM50, PTX40]>;
-def : Proc<"sm_52",   [SM52, PTX41]>;
-def : Proc<"sm_53",   [SM53, PTX42]>;
-def : Proc<"sm_60",   [SM60, PTX50]>;
-def : Proc<"sm_61",   [SM61, PTX50]>;
-def : Proc<"sm_62",   [SM62, PTX50]>;
-def : Proc<"sm_70",   [SM70, PTX60]>;
-def : Proc<"sm_72",   [SM72, PTX61]>;
-def : Proc<"sm_75",   [SM75, PTX63]>;
-def : Proc<"sm_80",   [SM80, PTX70]>;
-def : Proc<"sm_86",   [SM86, PTX71]>;
-def : Proc<"sm_87",   [SM87, PTX74]>;
-def : Proc<"sm_88",   [SM88, PTX90]>;
-def : Proc<"sm_89",   [SM89, PTX78]>;
-def : Proc<"sm_90",   [SM90, PTX78]>;
-def : Proc<"sm_90a",  [SM90a, PTX80]>;
-def : Proc<"sm_100",  [SM100, PTX86]>;
-def : Proc<"sm_100a", [SM100a, PTX86]>;
-def : Proc<"sm_100f", [SM100f, PTX88]>;
-def : Proc<"sm_101",  [SM101, PTX86]>;
-def : Proc<"sm_101a", [SM101a, PTX86]>;
-def : Proc<"sm_101f", [SM101f, PTX88]>;
-def : Proc<"sm_103",  [SM103, PTX88]>;
-def : Proc<"sm_103a", [SM103a, PTX88]>;
-def : Proc<"sm_103f", [SM103f, PTX88]>;
-def : Proc<"sm_110",  [SM110, PTX90]>;
-def : Proc<"sm_110a", [SM110a, PTX90]>;
-def : Proc<"sm_110f", [SM110f, PTX90]>;
-def : Proc<"sm_120",  [SM120, PTX87]>;
-def : Proc<"sm_120a", [SM120a, PTX87]>;
-def : Proc<"sm_120f", [SM120f, PTX88]>;
-def : Proc<"sm_121",  [SM121, PTX88]>;
-def : Proc<"sm_121a", [SM121a, PTX88]>;
-def : Proc<"sm_121f", [SM121f, PTX88]>;
+def : Proc<"sm_32",   [SM32]>;
+def : Proc<"sm_35",   [SM35]>;
+def : Proc<"sm_37",   [SM37]>;
+def : Proc<"sm_50",   [SM50]>;
+def : Proc<"sm_52",   [SM52]>;
+def : Proc<"sm_53",   [SM53]>;
+def : Proc<"sm_60",   [SM60]>;
+def : Proc<"sm_61",   [SM61]>;
+def : Proc<"sm_62",   [SM62]>;
+def : Proc<"sm_70",   [SM70]>;
+def : Proc<"sm_72",   [SM72]>;
+def : Proc<"sm_75",   [SM75]>;
+def : Proc<"sm_80",   [SM80]>;
+def : Proc<"sm_86",   [SM86]>;
+def : Proc<"sm_87",   [SM87]>;
+def : Proc<"sm_88",   [SM88]>;
+def : Proc<"sm_89",   [SM89]>;
+def : Proc<"sm_90",   [SM90]>;
+def : Proc<"sm_90a",  [SM90a]>;
+def : Proc<"sm_100",  [SM100]>;
+def : Proc<"sm_100a", [SM100a]>;
+def : Proc<"sm_100f", [SM100f]>;
+def : Proc<"sm_101",  [SM101]>;
+def : Proc<"sm_101a", [SM101a]>;
+def : Proc<"sm_101f", [SM101f]>;
+def : Proc<"sm_103",  [SM103]>;
+def : Proc<"sm_103a", [SM103a]>;
+def : Proc<"sm_103f", [SM103f]>;
+def : Proc<"sm_110",  [SM110]>;
+def : Proc<"sm_110a", [SM110a]>;
+def : Proc<"sm_110f", [SM110f]>;
+def : Proc<"sm_120",  [SM120]>;
+def : Proc<"sm_120a", [SM120a]>;
+def : Proc<"sm_120f", [SM120f]>;
+def : Proc<"sm_121",  [SM121]>;
+def : Proc<"sm_121a", [SM121a]>;
+def : Proc<"sm_121f", [SM121f]>;
 
 
 def Is64Bit : Predicate<"Subtarget->getTargetTriple().getArch() == 
Triple::nvptx64">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp 
b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 989be50d45554..00b21c0ef21ea 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -38,6 +38,82 @@ static cl::opt<bool> NoF32x2("nvptx-no-f32x2", cl::Hidden,
 // Pin the vtable to this file.
 void NVPTXSubtarget::anchor() {}
 
+// Returns the minimum PTX version required for a given SM target.
+// This must be kept in sync with the "Supported Targets" column of the
+// "PTX Release History" table in the PTX ISA documentation:
+// 
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes-ptx-release-history
+//
+// Note: LLVM's minimum supported PTX version is 3.2 (see FeaturePTX in
+// NVPTX.td), so older SMs that supported earlier PTX versions instead use 3.2
+// as their effective minimum.
+//
+// The FullSmVersion encoding is: SM * 10 + suffix_offset
+// where suffix_offset is: 0 (base), 2 ('f' suffix), or 3 ('a' suffix)
+// For example: sm_100 = 1000, sm_100f = 1002, sm_100a = 1003
+static unsigned getMinPTXVersionForSM(unsigned FullSmVersion) {
+  switch (FullSmVersion) {
+  case 200: // sm_20
+  case 210: // sm_21
+  case 300: // sm_30
+  case 350: // sm_35
+    return 32;
+  case 320: // sm_32
+  case 500: // sm_50
+    return 40;
+  case 370: // sm_37
+  case 520: // sm_52
+    return 41;
+  case 530: // sm_53
+    return 42;
+  case 600: // sm_60
+  case 610: // sm_61
+  case 620: // sm_62
+    return 50;
+  case 700: // sm_70
+    return 60;
+  case 720: // sm_72
+    return 61;
+  case 750: // sm_75
+    return 63;
+  case 800: // sm_80
+    return 70;
+  case 860: // sm_86
+    return 71;
+  case 870: // sm_87
+    return 74;
+  case 890: // sm_89
+  case 900: // sm_90
+    return 78;
+  case 903: // sm_90a
+    return 80;
+  case 1000: // sm_100
+  case 1003: // sm_100a
+  case 1010: // sm_101
+  case 1013: // sm_101a
+    return 86;
+  case 1200: // sm_120
+  case 1203: // sm_120a
+    return 87;
+  case 1002: // sm_100f
+  case 1012: // sm_101f
+  case 1030: // sm_103
+  case 1032: // sm_103f
+  case 1033: // sm_103a
+  case 1202: // sm_120f
+  case 1210: // sm_121
+  case 1212: // sm_121f
+  case 1213: // sm_121a
+    return 88;
+  case 880:  // sm_88
+  case 1100: // sm_110
+  case 1102: // sm_110f
+  case 1103: // sm_110a
+    return 90;
+  default:
+    llvm_unreachable("Unknown SM version");
+  }
+}
+
 NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
                                                                 StringRef FS) {
   TargetName = std::string(CPU);
@@ -49,9 +125,19 @@ NVPTXSubtarget 
&NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
   // sm_90a, which would *not* be a subset of sm_91.
   SmVersion = getSmVersion();
 
-  // Set default to PTX 6.0 (CUDA 9.0)
+  unsigned MinPTX = getMinPTXVersionForSM(FullSmVersion);
+
   if (PTXVersion == 0) {
-    PTXVersion = 60;
+    // User didn't request a specific PTX version; use the minimum for this SM.
+    PTXVersion = MinPTX;
+  } else if (PTXVersion < MinPTX) {
+    // User explicitly requested an insufficient PTX version.
+    report_fatal_error(formatv(
+        "PTX version {0}.{1} does not support target '{2}'. "
+        "Minimum required PTX version is {3}.{4}. "
+        "Either remove the PTX version to use the default, "
+        "or increase it to at least {3}.{4}.",
+        PTXVersion / 10, PTXVersion % 10, CPU, MinPTX / 10, MinPTX % 10));
   }
 
   return *this;
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h 
b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 5f426bf1a15f9..b6666ae429f44 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -36,8 +36,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
   unsigned PTXVersion;
 
-  // Full SM version x.y is represented as 100*x+10*y+feature, e.g. 3.1 == 310
-  // sm_90a == 901
+  // FullSmVersion is SM * 10 + suffix_offset
+  // where suffix_offset is: 0 (base), 2 ('f'), or 3 ('a')
+  // e.g. sm_30 == 300, sm_90a == 903, sm_100f == 1002
   unsigned int FullSmVersion;
 
   // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100a.ll 
b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll
index 16bd0da8c6a0c..cbf7c114b06ca 100644
--- a/llvm/test/CodeGen/NVPTX/convert-sm100a.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck %s
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | FileCheck %s
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | FileCheck %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx87 | FileCheck %s
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 
-mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
 ; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 
-mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %}
-; RUN: %if ptxas-sm_120a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 
-mcpu=sm_120a -mattr=+ptx86 | %ptxas-verify -arch=sm_120a %}
+; RUN: %if ptxas-sm_120a && ptxas-isa-8.7 %{ llc < %s -mtriple=nvptx64 
-mcpu=sm_120a -mattr=+ptx87 | %ptxas-verify -arch=sm_120a %}
 
 define i16 @cvt_rn_sf_e2m3x2_f32(float %f1, float %f2) {
 ; CHECK-LABEL: cvt_rn_sf_e2m3x2_f32(
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm103a.ll 
b/llvm/test/CodeGen/NVPTX/convert-sm103a.ll
index 54b4dd88867ed..b58c8b3e7abc5 100644
--- a/llvm/test/CodeGen/NVPTX/convert-sm103a.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-sm103a.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 6
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx87 | FileCheck %s
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103a -mattr=+ptx87 | FileCheck %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck %s
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.7 %{ llc < %s -mtriple=nvptx64 
-mcpu=sm_100a -mattr=+ptx87 | %ptxas-verify -arch=sm_100a %}
-; RUN: %if ptxas-sm_103a && ptxas-isa-8.7 %{ llc < %s -mtriple=nvptx64 
-mcpu=sm_103a -mattr=+ptx87 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 
-mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
 
 ; F16X2 conversions
 
diff --git a/llvm/test/CodeGen/NVPTX/f32-ex2.ll 
b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
index 97b9d35be371e..db3dd4a9e6011 100644
--- a/llvm/test/CodeGen/NVPTX/f32-ex2.ll
+++ b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
-; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s
-; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mtriple=nvptx64 
-mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %}
+; RUN: llc < %s -mcpu=sm_50 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-sm_50 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 | 
%ptxas-verify -arch=sm_50 %}
 target triple = "nvptx-nvidia-cuda"
 
 declare float @llvm.nvvm.ex2.approx.f32(float)
diff --git a/llvm/test/CodeGen/NVPTX/fexp2.ll b/llvm/test/CodeGen/NVPTX/fexp2.ll
index d9e82cc372e24..047e4bbc3fa32 100644
--- a/llvm/test/CodeGen/NVPTX/fexp2.ll
+++ b/llvm/test/CodeGen/NVPTX/fexp2.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
-; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -mcpu=sm_50 | FileCheck --check-prefixes=CHECK %s
 ; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck 
--check-prefixes=CHECK-FP16 %s
 ; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx78 | FileCheck 
--check-prefixes=CHECK-BF16 %s
-; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mcpu=sm_50 -mattr=+ptx32 
| %ptxas-verify -arch=sm_50 %}
+; RUN: %if ptxas-sm_50 %{ llc < %s -mcpu=sm_50 | %ptxas-verify -arch=sm_50 %}
 ; RUN: %if ptxas-sm_75 && ptxas-isa-7.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 
| %ptxas-verify -arch=sm_75 %}
 ; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78 
| %ptxas-verify -arch=sm_90 %}
 target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/CodeGen/NVPTX/flog2.ll b/llvm/test/CodeGen/NVPTX/flog2.ll
index 4aafc986db1d9..66e92e3428ff9 100644
--- a/llvm/test/CodeGen/NVPTX/flog2.ll
+++ b/llvm/test/CodeGen/NVPTX/flog2.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
-; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 -nvptx-approx-log2f32 | FileCheck 
--check-prefixes=CHECK %s
-; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mcpu=sm_50 -mattr=+ptx32 
-nvptx-approx-log2f32 | %ptxas-verify -arch=sm_50 %}
+; RUN: llc < %s -mcpu=sm_50 -nvptx-approx-log2f32 | FileCheck 
--check-prefixes=CHECK %s
+; RUN: %if ptxas-sm_50 %{ llc < %s -mcpu=sm_50 -nvptx-approx-log2f32 | 
%ptxas-verify -arch=sm_50 %}
 target triple = "nvptx64-nvidia-cuda"
 
 ; CHECK-LABEL: log2_test
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index 5726c2a5bbb16..9d82292852d84 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=nvptx64-- 2>&1 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-- | %ptxas-verify %}
+; RUN: llc < %s -mtriple=nvptx64-- -mattr=+ptx60 2>&1 | FileCheck %s
+; RUN: %if ptxas-isa-6.0 %{ llc < %s -mtriple=nvptx64-- -mattr=+ptx60 | 
%ptxas-verify %}
 
 define i128 @srem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: srem_i128(
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll 
b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll
index a7f3103e5fcbb..cdbf3c3305305 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx64 -O0 | FileCheck %s 
--check-prefixes=SM_52,COMMON
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx64 -O0 | FileCheck %s 
--check-prefixes=SM_70,COMMON
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx72 -O0 | FileCheck %s 
--check-prefixes=SM_90,COMMON
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -O0 | FileCheck %s 
--check-prefixes=SM_90,COMMON
 
 @.str = private unnamed_addr constant [12 x i8] c"__CUDA_ARCH\00"
 @.str1 = constant [11 x i8] c"__CUDA_FTZ\00"
diff --git a/llvm/test/CodeGen/NVPTX/ptx-version-validation.ll 
b/llvm/test/CodeGen/NVPTX/ptx-version-validation.ll
new file mode 100644
index 0000000000000..12614e3ef848f
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/ptx-version-validation.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103a -mattr=+ptx90 2>&1 | FileCheck 
%s --check-prefix=CHECK-SM103A-HIGH
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103a 2>&1 | FileCheck %s 
--check-prefix=CHECK-SM103A
+; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_103a -mattr=+ptx87 2>&1 | 
FileCheck %s --check-prefix=CHECK-SM103A-LOW
+; RUN: %if ptxas-sm_103a && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 
-mcpu=sm_103a -mattr=+ptx90 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_103a %{ llc < %s -mtriple=nvptx64 -mcpu=sm_103a | 
%ptxas-verify -arch=sm_103a %}
+
+; Test that sm_120a defaults/requires PTX 8.7
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a 2>&1 | FileCheck %s 
--check-prefix=CHECK-SM120A
+; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 2>&1 | 
FileCheck %s --check-prefix=CHECK-SM120A-LOW
+; RUN: %if ptxas-sm_120a %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a | 
%ptxas-verify -arch=sm_120a %}
+
+; Test that sm_90a defaults/requires PTX 8.0
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90a 2>&1 | FileCheck %s 
--check-prefix=CHECK-SM90A
+; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_90a -mattr=+ptx78 2>&1 | 
FileCheck %s --check-prefix=CHECK-SM90A-LOW
+; RUN: %if ptxas-sm_90a %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90a | 
%ptxas-verify -arch=sm_90a %}
+
+; Test older SM targets
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 2>&1 | FileCheck %s 
--check-prefix=CHECK-SM80
+; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx63 2>&1 | 
FileCheck %s --check-prefix=CHECK-SM80-LOW
+; RUN: %if ptxas-sm_80 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 | 
%ptxas-verify -arch=sm_80 %}
+
+; CHECK-SM103A-HIGH: .version 9.0
+; CHECK-SM103A-HIGH: .target sm_103a
+
+; CHECK-SM103A: .version 8.8
+; CHECK-SM103A: .target sm_103a
+
+; CHECK-SM103A-LOW: LLVM ERROR: PTX version 8.7 does not support target 
'sm_103a'.
+; CHECK-SM103A-LOW: Minimum required PTX version is 8.8.
+
+; CHECK-SM120A: .version 8.7
+; CHECK-SM120A: .target sm_120a
+
+; CHECK-SM120A-LOW: LLVM ERROR: PTX version 8.6 does not support target 
'sm_120a'.
+; CHECK-SM120A-LOW: Minimum required PTX version is 8.7.
+
+; CHECK-SM90A: .version 8.0
+; CHECK-SM90A: .target sm_90a
+
+; CHECK-SM90A-LOW: LLVM ERROR: PTX version 7.8 does not support target 
'sm_90a'.
+; CHECK-SM90A-LOW: Minimum required PTX version is 8.0.
+
+; CHECK-SM80: .version 7.0
+; CHECK-SM80: .target sm_80
+
+; CHECK-SM80-LOW: LLVM ERROR: PTX version 6.3 does not support target 'sm_80'.
+; CHECK-SM80-LOW: Minimum required PTX version is 7.0.
+
+define void @foo() {
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/rsqrt.ll b/llvm/test/CodeGen/NVPTX/rsqrt.ll
index 0e19dc11021c7..65bcf8d9f404b 100644
--- a/llvm/test/CodeGen/NVPTX/rsqrt.ll
+++ b/llvm/test/CodeGen/NVPTX/rsqrt.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 | %ptxas-verify %}
+; RUN: llc < %s -mtriple=nvptx64 -mattr=+ptx40 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mattr=+ptx40 | %ptxas-verify %}
 
 ; CHECK-LABEL: .func{{.*}}test1
 define float @test1(float %in) local_unnamed_addr {
diff --git a/llvm/test/CodeGen/NVPTX/sm-version.ll 
b/llvm/test/CodeGen/NVPTX/sm-version.ll
index c90c086e8b96c..620bfebd12037 100644
--- a/llvm/test/CodeGen/NVPTX/sm-version.ll
+++ b/llvm/test/CodeGen/NVPTX/sm-version.ll
@@ -76,7 +76,7 @@
 
 ; SM20: .version 3.2
 ; SM21: .version 3.2
-; SM30: .version 6.0
+; SM30: .version 3.2
 ; SM32: .version 4.0
 ; SM35: .version 3.2
 ; SM37: .version 4.1
diff --git a/llvm/test/CodeGen/NVPTX/surf-tex.py 
b/llvm/test/CodeGen/NVPTX/surf-tex.py
index 799ef8c56417d..dc949b879bd1b 100644
--- a/llvm/test/CodeGen/NVPTX/surf-tex.py
+++ b/llvm/test/CodeGen/NVPTX/surf-tex.py
@@ -1,6 +1,6 @@
 # RUN: %python %s --target=cuda --tests=suld,sust,tex,tld4 --gen-list=%t.list 
> %t-cuda.ll
-# RUN: llc -mcpu=sm_60 -mattr=+ptx43 %t-cuda.ll -verify-machineinstrs -o - | 
FileCheck %t-cuda.ll
-# RUN: %if ptxas-sm_60 && ptxas-isa-4.3 %{ llc -mcpu=sm_60 -mattr=+ptx43 
%t-cuda.ll -verify-machineinstrs -o - | %ptxas-verify -arch=sm_60 %}
+# RUN: llc -mcpu=sm_60 %t-cuda.ll -verify-machineinstrs -o - | FileCheck 
%t-cuda.ll
+# RUN: %if ptxas-sm_60 %{ llc -mcpu=sm_60 %t-cuda.ll -verify-machineinstrs -o 
- | %ptxas-verify -arch=sm_60 %}
 
 # We only need to run this second time for texture tests, because
 # there is a difference between unified and non-unified intrinsics.
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py 
b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py
deleted file mode 100644
index 121fa3d8068b1..0000000000000
--- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# Check all variants of instructions supported by PTX86 on SM120a
-# RUN: %python %s --ptx=86 --gpu-arch=120a > %t-ptx86-sm_120a.ll
-# RUN: FileCheck %t-ptx86-sm_120a.ll < %t-ptx86-sm_120a.ll \
-# RUN:           --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG
-# RUN: llc < %t-ptx86-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 \
-# RUN:           | FileCheck %t-ptx86-sm_120a.ll
-# RUN: %if ptxas-sm_120a && ptxas-isa-8.6 %{                                   
               \
-# RUN: llc < %t-ptx86-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 \
-# RUN:           | %ptxas-verify -arch=sm_120a                              \
-# RUN: %}
-
-import wmma
-
-wmma.main()

>From 7ebc1bc28da82e9665b1de8379106e3c2f96c690 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <[email protected]>
Date: Wed, 7 Jan 2026 22:00:19 +0000
Subject: [PATCH 2/4] Address review comments

---
 llvm/lib/Target/NVPTX/NVPTX.td           | 71 ++++----------------
 llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp | 84 +++++++++++-------------
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h   | 13 +++-
 3 files changed, 64 insertions(+), 104 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index bc2d67c9769dc..80491ac4cc1f8 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -68,9 +68,9 @@ class FeaturePTX<int version>:
 // represents 'z'), sm_103f, and sm_103 architecture variants. The sm_103 is
 // compatible with sm_103a and sm_103f, and sm_103f is compatible with sm_103a.
 //
-// Encoding := Arch * 10 + suffix_offset
+// Encoding := Arch * 10 + ArchSuffixOffset
 // Arch := X * 10 + Y
-// suffix_offset := 0 (base), 2 ('f'), or 3 ('a')
+// ArchSuffixOffset := 0 (base), 2 ('f'), or 3 ('a')
 //
 // For example, sm_103a is encoded as 1033 (103 * 10 + 3) and sm_103f is
 // encoded as 1032 (103 * 10 + 2).
@@ -81,21 +81,27 @@ class FeaturePTX<int version>:
 //  + Compare within the family by comparing FullSMVersion, given both belongs 
to
 //    the same family.
 //  + Detect 'a' variants by checking FullSMVersion & 1.
+class Proc<FeatureSM SM>
+ : Processor<SM.Name, NoItineraries, [SM]>;
+
 foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53, 60,
               61, 62, 70, 72, 75, 80, 86, 87, 88, 89,
               90, 100, 101, 103, 110, 120, 121] in {
   // Base SM version (e.g. FullSMVersion for sm_100 is 1000)
   def SM#sm : FeatureSM<""#sm, !mul(sm, 10)>;
+  def : Proc<!cast<FeatureSM>("SM"#sm)>;
 
-  // Family-specific targets which are compatible within same family
-  // (e.g. FullSMVersion for sm_100f is 1002)
-  if !ge(sm, 100) then
+  // Family-specific variants, compatible within same family (e.g. sm_100f = 
1002)
+  if !ge(sm, 100) then {
     def SM#sm#f : FeatureSM<""#sm#"f", !add(!mul(sm, 10), 2)>;
+    def : Proc<!cast<FeatureSM>("SM"#sm#"f")>;
+  }
 
-  // Architecture-specific targets which are incompatible across architectures
-  // (e.g. FullSMVersion for sm_100a is 1003)
-  if !ge(sm, 90) then
+  // Architecture-specific variants, incompatible across architectures (e.g. 
sm_100a = 1003)
+  if !ge(sm, 90) then {
     def SM#sm#a : FeatureSM<""#sm#"a", !add(!mul(sm, 10), 3)>;
+    def : Proc<!cast<FeatureSM>("SM"#sm#"a")>;
+  }
 }
 
 foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65, 70, 71, 72,
@@ -103,55 +109,6 @@ foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 
64, 65, 70, 71, 72,
                    90] in
   def PTX#version : FeaturePTX<version>;
 
-//===----------------------------------------------------------------------===//
-// NVPTX supported processors.
-//===----------------------------------------------------------------------===//
-
-class Proc<string Name, list<SubtargetFeature> Features>
- : Processor<Name, NoItineraries, Features>;
-
-def : Proc<"sm_20",   [SM20]>;
-def : Proc<"sm_21",   [SM21]>;
-def : Proc<"sm_30",   [SM30]>;
-def : Proc<"sm_32",   [SM32]>;
-def : Proc<"sm_35",   [SM35]>;
-def : Proc<"sm_37",   [SM37]>;
-def : Proc<"sm_50",   [SM50]>;
-def : Proc<"sm_52",   [SM52]>;
-def : Proc<"sm_53",   [SM53]>;
-def : Proc<"sm_60",   [SM60]>;
-def : Proc<"sm_61",   [SM61]>;
-def : Proc<"sm_62",   [SM62]>;
-def : Proc<"sm_70",   [SM70]>;
-def : Proc<"sm_72",   [SM72]>;
-def : Proc<"sm_75",   [SM75]>;
-def : Proc<"sm_80",   [SM80]>;
-def : Proc<"sm_86",   [SM86]>;
-def : Proc<"sm_87",   [SM87]>;
-def : Proc<"sm_88",   [SM88]>;
-def : Proc<"sm_89",   [SM89]>;
-def : Proc<"sm_90",   [SM90]>;
-def : Proc<"sm_90a",  [SM90a]>;
-def : Proc<"sm_100",  [SM100]>;
-def : Proc<"sm_100a", [SM100a]>;
-def : Proc<"sm_100f", [SM100f]>;
-def : Proc<"sm_101",  [SM101]>;
-def : Proc<"sm_101a", [SM101a]>;
-def : Proc<"sm_101f", [SM101f]>;
-def : Proc<"sm_103",  [SM103]>;
-def : Proc<"sm_103a", [SM103a]>;
-def : Proc<"sm_103f", [SM103f]>;
-def : Proc<"sm_110",  [SM110]>;
-def : Proc<"sm_110a", [SM110a]>;
-def : Proc<"sm_110f", [SM110f]>;
-def : Proc<"sm_120",  [SM120]>;
-def : Proc<"sm_120a", [SM120a]>;
-def : Proc<"sm_120f", [SM120f]>;
-def : Proc<"sm_121",  [SM121]>;
-def : Proc<"sm_121a", [SM121a]>;
-def : Proc<"sm_121f", [SM121f]>;
-
-
 def Is64Bit : Predicate<"Subtarget->getTargetTriple().getArch() == 
Triple::nvptx64">;
 def NVPTX64 : HwMode<[Is64Bit]>;
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp 
b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 00b21c0ef21ea..456265061ad65 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -46,68 +46,64 @@ void NVPTXSubtarget::anchor() {}
 // Note: LLVM's minimum supported PTX version is 3.2 (see FeaturePTX in
 // NVPTX.td), so older SMs that supported earlier PTX versions instead use 3.2
 // as their effective minimum.
-//
-// The FullSmVersion encoding is: SM * 10 + suffix_offset
-// where suffix_offset is: 0 (base), 2 ('f' suffix), or 3 ('a' suffix)
-// For example: sm_100 = 1000, sm_100f = 1002, sm_100a = 1003
 static unsigned getMinPTXVersionForSM(unsigned FullSmVersion) {
   switch (FullSmVersion) {
-  case 200: // sm_20
-  case 210: // sm_21
-  case 300: // sm_30
-  case 350: // sm_35
+  case SM(20):
+  case SM(21):
+  case SM(30):
+  case SM(35):
     return 32;
-  case 320: // sm_32
-  case 500: // sm_50
+  case SM(32):
+  case SM(50):
     return 40;
-  case 370: // sm_37
-  case 520: // sm_52
+  case SM(37):
+  case SM(52):
     return 41;
-  case 530: // sm_53
+  case SM(53):
     return 42;
-  case 600: // sm_60
-  case 610: // sm_61
-  case 620: // sm_62
+  case SM(60):
+  case SM(61):
+  case SM(62):
     return 50;
-  case 700: // sm_70
+  case SM(70):
     return 60;
-  case 720: // sm_72
+  case SM(72):
     return 61;
-  case 750: // sm_75
+  case SM(75):
     return 63;
-  case 800: // sm_80
+  case SM(80):
     return 70;
-  case 860: // sm_86
+  case SM(86):
     return 71;
-  case 870: // sm_87
+  case SM(87):
     return 74;
-  case 890: // sm_89
-  case 900: // sm_90
+  case SM(89):
+  case SM(90):
     return 78;
-  case 903: // sm_90a
+  case SMA(90):
     return 80;
-  case 1000: // sm_100
-  case 1003: // sm_100a
-  case 1010: // sm_101
-  case 1013: // sm_101a
+  case SM(100):
+  case SMA(100):
+  case SM(101):
+  case SMA(101):
     return 86;
-  case 1200: // sm_120
-  case 1203: // sm_120a
+  case SM(120):
+  case SMA(120):
     return 87;
-  case 1002: // sm_100f
-  case 1012: // sm_101f
-  case 1030: // sm_103
-  case 1032: // sm_103f
-  case 1033: // sm_103a
-  case 1202: // sm_120f
-  case 1210: // sm_121
-  case 1212: // sm_121f
-  case 1213: // sm_121a
+  case SMF(100):
+  case SMF(101):
+  case SM(103):
+  case SMF(103):
+  case SMA(103):
+  case SMF(120):
+  case SM(121):
+  case SMF(121):
+  case SMA(121):
     return 88;
-  case 880:  // sm_88
-  case 1100: // sm_110
-  case 1102: // sm_110f
-  case 1103: // sm_110a
+  case SM(88):
+  case SM(110):
+  case SMF(110):
+  case SMA(110):
     return 90;
   default:
     llvm_unreachable("Unknown SM version");
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h 
b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index b6666ae429f44..9d70fcbc9f5f0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -29,6 +29,13 @@
 
 namespace llvm {
 
+// FullSmVersion encoding: SM * 10 + ArchSuffixOffset
+// ArchSuffixOffset: 0 (base), 2 ('f'), 3 ('a')
+// e.g. SM(100)=1000 (sm_100), SMF(100)=1002 (sm_100f), SMA(100)=1003 (sm_100a)
+inline constexpr unsigned SM(unsigned Version) { return Version * 10; }
+inline constexpr unsigned SMF(unsigned Version) { return SM(Version) + 2; }
+inline constexpr unsigned SMA(unsigned Version) { return SM(Version) + 3; }
+
 class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   virtual void anchor();
   std::string TargetName;
@@ -36,9 +43,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
   unsigned PTXVersion;
 
-  // FullSmVersion is SM * 10 + suffix_offset
-  // where suffix_offset is: 0 (base), 2 ('f'), or 3 ('a')
-  // e.g. sm_30 == 300, sm_90a == 903, sm_100f == 1002
+  // FullSmVersion encoding: SM * 10 + ArchSuffixOffset
+  // ArchSuffixOffset: 0 (base), 2 ('f'), 3 ('a')
+  // e.g. SM(30) == 300, SMA(90) == 903, SMF(100) == 1002
   unsigned int FullSmVersion;
 
   // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from

>From bf80f65dd14697ebf60288a6cc9744c43d214d93 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <[email protected]>
Date: Thu, 8 Jan 2026 00:16:27 +0000
Subject: [PATCH 3/4] Fix failing clang + mlir tests

---
 clang/lib/Basic/Targets/NVPTX.cpp                    | 4 +++-
 clang/lib/Basic/Targets/NVPTX.h                      | 5 ++++-
 clang/test/CodeGen/builtins-nvptx-ptx60.cu           | 2 +-
 clang/test/CodeGen/builtins-nvptx.c                  | 4 ++--
 clang/test/CodeGen/nvptx_attributes.c                | 2 +-
 clang/test/CodeGenCUDA/convergent.cu                 | 8 ++++----
 clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp | 2 +-
 mlir/include/mlir/Dialect/GPU/Transforms/Passes.td   | 2 +-
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td          | 4 ++--
 9 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/clang/lib/Basic/Targets/NVPTX.cpp 
b/clang/lib/Basic/Targets/NVPTX.cpp
index 06db3aae0c755..3be0fe9f30c1e 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -42,7 +42,9 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple,
   assert((TargetPointerWidth == 32 || TargetPointerWidth == 64) &&
          "NVPTX only supports 32- and 64-bit modes.");
 
-  PTXVersion = 32;
+  // PTXVersion is 0 by default, meaning "use the minimum for the SM target".
+  // Only set it if the user explicitly requested a PTX version.
+  PTXVersion = 0;
   for (const StringRef Feature : Opts.FeaturesAsWritten) {
     int PTXV;
     if (!Feature.starts_with("+ptx") ||
diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h
index 6338a4f2f9036..9bd0cc36d12b4 100644
--- a/clang/lib/Basic/Targets/NVPTX.h
+++ b/clang/lib/Basic/Targets/NVPTX.h
@@ -89,7 +89,10 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public 
TargetInfo {
                  const std::vector<std::string> &FeaturesVec) const override {
     if (GPU != OffloadArch::UNUSED)
       Features[OffloadArchToString(GPU)] = true;
-    Features["ptx" + std::to_string(PTXVersion)] = true;
+    // Only add PTX feature if explicitly requested. Otherwise, let the backend
+    // use the minimum required PTX version for the target SM.
+    if (PTXVersion != 0)
+      Features["ptx" + std::to_string(PTXVersion)] = true;
     return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
   }
 
diff --git a/clang/test/CodeGen/builtins-nvptx-ptx60.cu 
b/clang/test/CodeGen/builtins-nvptx-ptx60.cu
index 8b2514a183221..04d391a10115c 100644
--- a/clang/test/CodeGen/builtins-nvptx-ptx60.cu
+++ b/clang/test/CodeGen/builtins-nvptx-ptx60.cu
@@ -3,7 +3,7 @@
 // RUN:            -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK %s
 // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_80 \
-// RUN:            -fcuda-is-device -target-feature +ptx65 \
+// RUN:            -fcuda-is-device -target-feature +ptx70 \
 // RUN:            -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK %s
 // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_80 \
diff --git a/clang/test/CodeGen/builtins-nvptx.c 
b/clang/test/CodeGen/builtins-nvptx.c
index cd1447374d000..470a27a60bbe7 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -43,10 +43,10 @@
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_101a -target-feature +ptx86 -DPTX=86 \
 // RUN:            -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x 
cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM101a %s
-// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_120a -target-feature +ptx86 -DPTX=86 \
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_120a -target-feature +ptx87 -DPTX=87 \
 // RUN:            -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x 
cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s
-// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_103a -target-feature +ptx87 -DPTX=87 \
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_103a -target-feature +ptx88 -DPTX=88 \
 // RUN:            -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x 
cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX87_SM103a %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_100a -target-feature +ptx87 -DPTX=87 \
diff --git a/clang/test/CodeGen/nvptx_attributes.c 
b/clang/test/CodeGen/nvptx_attributes.c
index 8b9f3a2c18a1d..4695fca51ea53 100644
--- a/clang/test/CodeGen/nvptx_attributes.c
+++ b/clang/test/CodeGen/nvptx_attributes.c
@@ -16,7 +16,7 @@ __attribute__((nvptx_kernel)) void foo(int *ret) {
 }
 
 //.
-// CHECK: attributes #[[ATTR0]] = { convergent noinline nounwind optnone 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-cpu"="sm_61" "target-features"="+ptx32,+sm_61" }
+// CHECK: attributes #[[ATTR0]] = { convergent noinline nounwind optnone 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-cpu"="sm_61" "target-features"="+sm_61" }
 //.
 // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
 // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
diff --git a/clang/test/CodeGenCUDA/convergent.cu 
b/clang/test/CodeGenCUDA/convergent.cu
index b187f3a8a32d6..87948235f736e 100644
--- a/clang/test/CodeGenCUDA/convergent.cu
+++ b/clang/test/CodeGenCUDA/convergent.cu
@@ -71,10 +71,10 @@ __host__ __device__ void bar() {
 
 
 //.
-// DEVICE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind 
optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+ptx32" }
-// DEVICE: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+ptx32" }
-// DEVICE: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+ptx32" }
-// DEVICE: attributes #[[ATTR3:[0-9]+]] = { nounwind "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// DEVICE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind 
optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// DEVICE: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// DEVICE: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// DEVICE: attributes #[[ATTR3:[0-9]+]] = { nounwind "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" }
 // DEVICE: attributes #[[ATTR4]] = { convergent nounwind }
 // DEVICE: attributes #[[ATTR5]] = { convergent nounwind memory(none) }
 // DEVICE: attributes #[[ATTR6]] = { nounwind }
diff --git a/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp 
b/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp
index cd1d4d801951d..67b53f3ae81cf 100644
--- a/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp
+++ b/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp
@@ -182,7 +182,7 @@ int main() {
 // CHECK-AMDGCN: #[[AMDGCN_ATTR0]] = { convergent mustprogress noinline 
norecurse nounwind optnone "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" }
 // CHECK-AMDGCN: #[[AMDGCN_ATTR1]] = { convergent nounwind }
 //
-// CHECK-NVPTX: #[[NVPTX_ATTR0]] = { convergent mustprogress noinline 
norecurse nounwind optnone "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// CHECK-NVPTX: #[[NVPTX_ATTR0]] = { convergent mustprogress noinline 
norecurse nounwind optnone "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" }
 // CHECK-NVPTX: #[[NVPTX_ATTR1]] = { convergent nounwind }
 //
 // CHECK-SPIR: #[[SPIR_ATTR0]] = { convergent mustprogress noinline norecurse 
nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td 
b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index 0c8a0c7a677ab..9dbaef9d9b640 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -146,7 +146,7 @@ def GpuNVVMAttachTarget: Pass<"nvvm-attach-target", ""> {
            /*default=*/"\"sm_50\"",
            "Target chip.">,
     Option<"features", "features", "std::string",
-           /*default=*/"\"+ptx60\"",
+           /*default=*/"\"\"",
            "Target features.">,
     Option<"optLevel", "O", "unsigned",
            /*default=*/"2",
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td 
b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 853b2800bc0ff..18fbbbef777a6 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -6185,7 +6185,7 @@ def NVVM_TargetAttr : NVVM_Attr<"NVVMTarget", "target",
     DefaultValuedParameter<"int", "2", "Optimization level to apply.">:$O,
     StringRefParameter<"Target triple.", "\"nvptx64-nvidia-cuda\"">:$triple,
     StringRefParameter<"Target chip.", "\"sm_50\"">:$chip,
-    StringRefParameter<"Target chip features.", "\"+ptx60\"">:$features,
+    StringRefParameter<"Target chip features.", "\"\"">:$features,
     OptionalParameter<"DictionaryAttr", "Target specific flags.">:$flags,
     OptionalParameter<"ArrayAttr", "Files to link to the LLVM module.">:$link,
     DefaultValuedParameter<"bool", "true", "Perform SM version check on 
Ops.">:$verifyTarget
@@ -6197,7 +6197,7 @@ def NVVM_TargetAttr : NVVM_Attr<"NVVMTarget", "target",
     AttrBuilder<(ins CArg<"int", "2">:$optLevel,
                      CArg<"StringRef", "\"nvptx64-nvidia-cuda\"">:$triple,
                      CArg<"StringRef", "\"sm_50\"">:$chip,
-                     CArg<"StringRef", "\"+ptx60\"">:$features,
+                     CArg<"StringRef", "\"\"">:$features,
                      CArg<"DictionaryAttr", "nullptr">:$targetFlags,
                      CArg<"ArrayAttr", "nullptr">:$linkFiles,
                      CArg<"bool", "true">:$verifyTarget), [{

>From 9ea5f905b640d6ae58879a92871df23b9b9dddad Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <[email protected]>
Date: Thu, 8 Jan 2026 20:52:16 +0000
Subject: [PATCH 4/4] Fixup rebase | Attempt flang fix

---
 flang/lib/Frontend/CompilerInstance.cpp           | 15 +++------------
 flang/test/Lower/OpenMP/target_cpu_features.f90   |  2 +-
 llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp          |  8 +++++++-
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h            |  7 ++-----
 .../NVPTX/clusterlaunchcontrol-multicast.ll       |  8 ++++----
 5 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/flang/lib/Frontend/CompilerInstance.cpp 
b/flang/lib/Frontend/CompilerInstance.cpp
index 851cd1f47afd2..5448293584d47 100644
--- a/flang/lib/Frontend/CompilerInstance.cpp
+++ b/flang/lib/Frontend/CompilerInstance.cpp
@@ -288,25 +288,16 @@ 
getExplicitAndImplicitNVPTXTargetFeatures(clang::DiagnosticsEngine &diags,
                                           const llvm::Triple triple) {
   llvm::StringRef cpu = targetOpts.cpu;
   llvm::StringMap<bool> implicitFeaturesMap;
-  std::string errorMsg;
-  bool ptxVer = false;
 
   // Add target features specified by the user
   for (auto &userFeature : targetOpts.featuresAsWritten) {
     llvm::StringRef userKeyString(llvm::StringRef(userFeature).drop_front(1));
     implicitFeaturesMap[userKeyString.str()] = (userFeature[0] == '+');
-    // Check if the user provided a PTX version
-    if (userKeyString.starts_with("ptx"))
-      ptxVer = true;
   }
 
-  // Set the default PTX version to `ptx61` if none was provided.
-  // TODO: set the default PTX version based on the chip.
-  if (!ptxVer)
-    implicitFeaturesMap["ptx61"] = true;
-
-  // Set the compute capability.
-  implicitFeaturesMap[cpu.str()] = true;
+  // Set the compute capability (only if one was explicitly provided).
+  if (!cpu.empty())
+    implicitFeaturesMap[cpu.str()] = true;
 
   llvm::SmallVector<std::string> featuresVec;
   for (auto &implicitFeatureItem : implicitFeaturesMap) {
diff --git a/flang/test/Lower/OpenMP/target_cpu_features.f90 
b/flang/test/Lower/OpenMP/target_cpu_features.f90
index 341cfc7991d43..78f29b23068af 100644
--- a/flang/test/Lower/OpenMP/target_cpu_features.f90
+++ b/flang/test/Lower/OpenMP/target_cpu_features.f90
@@ -16,4 +16,4 @@
 
 !NVPTX: module attributes {
 !NVPTX-SAME: fir.target_cpu = "sm_80"
-!NVPTX-SAME: fir.target_features = #llvm.target_features<["+ptx61", "+sm_80"]>
+!NVPTX-SAME: fir.target_features = #llvm.target_features<["+sm_80"]>
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp 
b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 456265061ad65..22077e19a9527 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -35,6 +35,12 @@ static cl::opt<bool> NoF32x2("nvptx-no-f32x2", cl::Hidden,
                                       "f32x2 instructions and registers."),
                              cl::init(false));
 
+// FullSmVersion encoding helpers: SM * 10 + suffix offset
+// (0 = base, 2 = 'f', 3 = 'a').
+static constexpr unsigned SM(unsigned Version) { return Version * 10; }
+static constexpr unsigned SMF(unsigned Version) { return SM(Version) + 2; }
+static constexpr unsigned SMA(unsigned Version) { return SM(Version) + 3; }
+
 // Pin the vtable to this file.
 void NVPTXSubtarget::anchor() {}
 
@@ -128,7 +134,7 @@ NVPTXSubtarget 
&NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
     PTXVersion = MinPTX;
   } else if (PTXVersion < MinPTX) {
     // User explicitly requested an insufficient PTX version.
-    report_fatal_error(formatv(
+    reportFatalUsageError(formatv(
         "PTX version {0}.{1} does not support target '{2}'. "
         "Minimum required PTX version is {3}.{4}. "
         "Either remove the PTX version to use the default, "
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h 
b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 9d70fcbc9f5f0..6b77ae5abfa9f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -31,10 +31,7 @@ namespace llvm {
 
 // FullSmVersion encoding: SM * 10 + ArchSuffixOffset
 // ArchSuffixOffset: 0 (base), 2 ('f'), 3 ('a')
-// e.g. SM(100)=1000 (sm_100), SMF(100)=1002 (sm_100f), SMA(100)=1003 (sm_100a)
-inline constexpr unsigned SM(unsigned Version) { return Version * 10; }
-inline constexpr unsigned SMF(unsigned Version) { return SM(Version) + 2; }
-inline constexpr unsigned SMA(unsigned Version) { return SM(Version) + 3; }
+// e.g. sm_100 -> 1000, sm_100f -> 1002, sm_100a -> 1003
 
 class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   virtual void anchor();
@@ -45,7 +42,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
 
   // FullSmVersion encoding: SM * 10 + ArchSuffixOffset
   // ArchSuffixOffset: 0 (base), 2 ('f'), 3 ('a')
-  // e.g. SM(30) == 300, SMA(90) == 903, SMF(100) == 1002
+  // e.g. sm_30 -> 300, sm_90a -> 903, sm_100f -> 1002
   unsigned int FullSmVersion;
 
   // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from
diff --git a/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll 
b/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll
index 9e6beda9b64aa..c115cc546df28 100644
--- a/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll
+++ b/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll
@@ -19,10 +19,10 @@
 ; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 
-mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
 ; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 
-mcpu=sm_110f -mattr=+ptx90 --nvptx-short-ptr | %ptxas-verify -arch=sm_110f %}
 
-; RUN: llc -o - -mcpu=sm_120a -march=nvptx64 -mattr=+ptx86 %s | FileCheck %s 
--check-prefixes=CHECK,CHECK-PTX-SHARED64
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 --nvptx-short-ptr 
| FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s
-; RUN: %if ptxas-sm_120a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 
-mcpu=sm_120a -mattr=+ptx86 | %ptxas-verify -arch=sm_120a %}
-; RUN: %if ptxas-sm_120a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 
-mcpu=sm_120a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_120a %}
+; RUN: llc -o - -mcpu=sm_120a -march=nvptx64 -mattr=+ptx87 %s | FileCheck %s 
--check-prefixes=CHECK,CHECK-PTX-SHARED64
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx87 --nvptx-short-ptr 
| FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s
+; RUN: %if ptxas-sm_120a && ptxas-isa-8.7 %{ llc < %s -mtriple=nvptx64 
-mcpu=sm_120a -mattr=+ptx87 | %ptxas-verify -arch=sm_120a %}
+; RUN: %if ptxas-sm_120a && ptxas-isa-8.7 %{ llc < %s -mtriple=nvptx64 
-mcpu=sm_120a -mattr=+ptx87 --nvptx-short-ptr | %ptxas-verify -arch=sm_120a %}
 
 ; RUN: llc -o - -mcpu=sm_120f -march=nvptx64 -mattr=+ptx88 %s | FileCheck %s 
--check-prefixes=CHECK,CHECK-PTX-SHARED64
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120f -mattr=+ptx88 --nvptx-short-ptr 
| FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to