Author: Justin Fargnoli Date: 2026-02-05T17:42:41Z New Revision: ca93dd399eec6436bebe4c6d727ff6af0ab7b7d4
URL: https://github.com/llvm/llvm-project/commit/ca93dd399eec6436bebe4c6d727ff6af0ab7b7d4 DIFF: https://github.com/llvm/llvm-project/commit/ca93dd399eec6436bebe4c6d727ff6af0ab7b7d4.diff LOG: Reland "[NVPTX] Validate user-specified PTX version against SM version" (#179304) Previous commit message: > Original commit message: > >>When users explicitly specify a PTX version via -mattr=+ptxNN that's insufficient for their target SM, we now emit a fatal error. Previously, we silently upgraded the PTX version to the minimum required for the target SM. >> >>When no SM or PTX version is specified, we now use PTX 3.2 (the minimum for the default SM 3.0) instead of PTX 6.0. > >The following commits should fix the failures that arose when I previously tried to land this commit: > >https://github.com/llvm/llvm-project/commit/9fc5fd0ad689eed94f65b1d6d10f9c5642935e68 should address the llvm-nvptx*-nvidia-* build failures: https://github.com/llvm/llvm-project/pull/174834#issuecomment-3742242651 > >https://github.com/llvm/llvm-project/commit/600514a63760c6730e4cd970d2fcead9c5a897b3 should address the MLIR failures --- The previous commit was reverted with d23cb79ba497281de050ef609cb91b91058bf323 because the [mlir-nvidia](https://lab.llvm.org/buildbot/#/builders/138/builds/24797) and [mlir-nvidia-gcc7](https://lab.llvm.org/buildbot/#/builders/116/builds/23929) Buildbots were failing. Those tests failed because MLIR's default SM was 5.0, which caused NVPTX to target PTX ISA v4.0, which did not support the intrinsics used in the failing tests. 243f011577193c99358ccc4142b296d4fa80ea11 should address this by bumping MLIR's default SM to 7.5. Now, using MLIR's new default SM, NVPTX targets the PTX ISA v6.3, which supports the intrinsics used in the failing tests. Added: llvm/test/CodeGen/NVPTX/ptx-version-validation.ll Modified: clang/lib/Basic/Targets/NVPTX.cpp clang/lib/Basic/Targets/NVPTX.h clang/test/CodeGen/builtins-nvptx-ptx60.cu clang/test/CodeGen/builtins-nvptx.c clang/test/CodeGen/nvptx_attributes.c clang/test/CodeGenCUDA/convergent.cu clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp clang/test/Headers/gpuintrin.c flang/lib/Frontend/CompilerInstance.cpp flang/test/Lower/OpenMP/target_cpu_features.f90 llvm/lib/Target/NVPTX/NVPTX.td llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp llvm/lib/Target/NVPTX/NVPTXSubtarget.h llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll llvm/test/CodeGen/NVPTX/convert-sm100a.ll llvm/test/CodeGen/NVPTX/convert-sm103a.ll llvm/test/CodeGen/NVPTX/f32-ex2.ll llvm/test/CodeGen/NVPTX/fexp2.ll llvm/test/CodeGen/NVPTX/flog2.ll llvm/test/CodeGen/NVPTX/i128.ll llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll llvm/test/CodeGen/NVPTX/rsqrt.ll llvm/test/CodeGen/NVPTX/sm-version.ll llvm/test/CodeGen/NVPTX/surf-tex.py mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h mlir/include/mlir/Dialect/GPU/Transforms/Passes.td mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td Removed: llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py ################################################################################ diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index dec076ac54f41..6526eeff3d718 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -42,7 +42,9 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple, assert((TargetPointerWidth == 32 || TargetPointerWidth == 64) && "NVPTX only supports 32- and 64-bit modes."); - PTXVersion = 32; + // PTXVersion is 0 by default, meaning "use the minimum for the SM target". + // Only set it if the user explicitly requested a PTX version. + PTXVersion = 0; for (const StringRef Feature : Opts.FeaturesAsWritten) { int PTXV; if (!Feature.starts_with("+ptx") || diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h index 6338a4f2f9036..9bd0cc36d12b4 100644 --- a/clang/lib/Basic/Targets/NVPTX.h +++ b/clang/lib/Basic/Targets/NVPTX.h @@ -89,7 +89,10 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo { const std::vector<std::string> &FeaturesVec) const override { if (GPU != OffloadArch::UNUSED) Features[OffloadArchToString(GPU)] = true; - Features["ptx" + std::to_string(PTXVersion)] = true; + // Only add PTX feature if explicitly requested. Otherwise, let the backend + // use the minimum required PTX version for the target SM. + if (PTXVersion != 0) + Features["ptx" + std::to_string(PTXVersion)] = true; return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec); } diff --git a/clang/test/CodeGen/builtins-nvptx-ptx60.cu b/clang/test/CodeGen/builtins-nvptx-ptx60.cu index 8b2514a183221..04d391a10115c 100644 --- a/clang/test/CodeGen/builtins-nvptx-ptx60.cu +++ b/clang/test/CodeGen/builtins-nvptx-ptx60.cu @@ -3,7 +3,7 @@ // RUN: -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK %s // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_80 \ -// RUN: -fcuda-is-device -target-feature +ptx65 \ +// RUN: -fcuda-is-device -target-feature +ptx70 \ // RUN: -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK %s // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_80 \ diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index a739b66042f19..2e1acc0aac259 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -46,10 +46,10 @@ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_101a -target-feature +ptx86 -DPTX=86 \ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM101a %s -// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx86 -DPTX=86 \ +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx87 -DPTX=87 \ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s -// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_103a -target-feature +ptx87 -DPTX=87 \ +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_103a -target-feature +ptx88 -DPTX=88 \ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX87_SM103a %s // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 -DPTX=87 \ diff --git a/clang/test/CodeGen/nvptx_attributes.c b/clang/test/CodeGen/nvptx_attributes.c index 8b9f3a2c18a1d..4695fca51ea53 100644 --- a/clang/test/CodeGen/nvptx_attributes.c +++ b/clang/test/CodeGen/nvptx_attributes.c @@ -16,7 +16,7 @@ __attribute__((nvptx_kernel)) void foo(int *ret) { } //. -// CHECK: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx32,+sm_61" } +// CHECK: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+sm_61" } //. // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} diff --git a/clang/test/CodeGenCUDA/convergent.cu b/clang/test/CodeGenCUDA/convergent.cu index b187f3a8a32d6..87948235f736e 100644 --- a/clang/test/CodeGenCUDA/convergent.cu +++ b/clang/test/CodeGenCUDA/convergent.cu @@ -71,10 +71,10 @@ __host__ __device__ void bar() { //. -// DEVICE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" } -// DEVICE: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" } -// DEVICE: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" } -// DEVICE: attributes #[[ATTR3:[0-9]+]] = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" } +// DEVICE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// DEVICE: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// DEVICE: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// DEVICE: attributes #[[ATTR3:[0-9]+]] = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // DEVICE: attributes #[[ATTR4]] = { convergent nounwind } // DEVICE: attributes #[[ATTR5]] = { convergent nounwind memory(none) } // DEVICE: attributes #[[ATTR6]] = { nounwind } diff --git a/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp b/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp index cd1d4d801951d..67b53f3ae81cf 100644 --- a/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp +++ b/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp @@ -182,7 +182,7 @@ int main() { // CHECK-AMDGCN: #[[AMDGCN_ATTR0]] = { convergent mustprogress noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // CHECK-AMDGCN: #[[AMDGCN_ATTR1]] = { convergent nounwind } // -// CHECK-NVPTX: #[[NVPTX_ATTR0]] = { convergent mustprogress noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" } +// CHECK-NVPTX: #[[NVPTX_ATTR0]] = { convergent mustprogress noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // CHECK-NVPTX: #[[NVPTX_ATTR1]] = { convergent nounwind } // // CHECK-SPIR: #[[SPIR_ATTR0]] = { convergent mustprogress noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c index a9b9889b280ae..565481ab0a971 100644 --- a/clang/test/Headers/gpuintrin.c +++ b/clang/test/Headers/gpuintrin.c @@ -5,8 +5,8 @@ // RUN: | FileCheck %s --check-prefix=AMDGPU // RUN: %clang_cc1 -internal-isystem %S/Inputs/include \ // RUN: -internal-isystem %S/../../lib/Headers/ \ -// RUN: -target-feature +ptx62 \ -// RUN: -triple nvptx64-nvidia-cuda -emit-llvm %s -o - \ +// RUN: -triple nvptx64-nvidia-cuda -target-feature +ptx63 \ +// RUN: -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefix=NVPTX // RUN: %clang_cc1 -internal-isystem %S/Inputs/include \ // RUN: -internal-isystem %S/../../lib/Headers/ \ diff --git a/flang/lib/Frontend/CompilerInstance.cpp b/flang/lib/Frontend/CompilerInstance.cpp index 851cd1f47afd2..5448293584d47 100644 --- a/flang/lib/Frontend/CompilerInstance.cpp +++ b/flang/lib/Frontend/CompilerInstance.cpp @@ -288,25 +288,16 @@ getExplicitAndImplicitNVPTXTargetFeatures(clang::DiagnosticsEngine &diags, const llvm::Triple triple) { llvm::StringRef cpu = targetOpts.cpu; llvm::StringMap<bool> implicitFeaturesMap; - std::string errorMsg; - bool ptxVer = false; // Add target features specified by the user for (auto &userFeature : targetOpts.featuresAsWritten) { llvm::StringRef userKeyString(llvm::StringRef(userFeature).drop_front(1)); implicitFeaturesMap[userKeyString.str()] = (userFeature[0] == '+'); - // Check if the user provided a PTX version - if (userKeyString.starts_with("ptx")) - ptxVer = true; } - // Set the default PTX version to `ptx61` if none was provided. - // TODO: set the default PTX version based on the chip. - if (!ptxVer) - implicitFeaturesMap["ptx61"] = true; - - // Set the compute capability. - implicitFeaturesMap[cpu.str()] = true; + // Set the compute capability (only if one was explicitly provided). + if (!cpu.empty()) + implicitFeaturesMap[cpu.str()] = true; llvm::SmallVector<std::string> featuresVec; for (auto &implicitFeatureItem : implicitFeaturesMap) { diff --git a/flang/test/Lower/OpenMP/target_cpu_features.f90 b/flang/test/Lower/OpenMP/target_cpu_features.f90 index 341cfc7991d43..78f29b23068af 100644 --- a/flang/test/Lower/OpenMP/target_cpu_features.f90 +++ b/flang/test/Lower/OpenMP/target_cpu_features.f90 @@ -16,4 +16,4 @@ !NVPTX: module attributes { !NVPTX-SAME: fir.target_cpu = "sm_80" -!NVPTX-SAME: fir.target_features = #llvm.target_features<["+ptx61", "+sm_80"]> +!NVPTX-SAME: fir.target_features = #llvm.target_features<["+sm_80"]> diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td index d41a43de95098..80491ac4cc1f8 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.td +++ b/llvm/lib/Target/NVPTX/NVPTX.td @@ -68,10 +68,11 @@ class FeaturePTX<int version>: // represents 'z'), sm_103f, and sm_103 architecture variants. The sm_103 is // compatible with sm_103a and sm_103f, and sm_103f is compatible with sm_103a. // -// Encoding := Arch * 10 + 2 (for 'f') + 1 (for 'a') +// Encoding := Arch * 10 + ArchSuffixOffset // Arch := X * 10 + Y +// ArchSuffixOffset := 0 (base), 2 ('f'), or 3 ('a') // -// For example, sm_103a is encoded as 1033 (103 * 10 + 2 + 1) and sm_103f is +// For example, sm_103a is encoded as 1033 (103 * 10 + 3) and sm_103f is // encoded as 1032 (103 * 10 + 2). // // This encoding allows simple partial ordering of the architectures. @@ -80,21 +81,27 @@ class FeaturePTX<int version>: // + Compare within the family by comparing FullSMVersion, given both belongs to // the same family. // + Detect 'a' variants by checking FullSMVersion & 1. +class Proc<FeatureSM SM> + : Processor<SM.Name, NoItineraries, [SM]>; + foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80, 86, 87, 88, 89, 90, 100, 101, 103, 110, 120, 121] in { // Base SM version (e.g. FullSMVersion for sm_100 is 1000) def SM#sm : FeatureSM<""#sm, !mul(sm, 10)>; + def : Proc<!cast<FeatureSM>("SM"#sm)>; - // Family-specific targets which are compatible within same family - // (e.g. FullSMVersion for sm_100f is 1002) - if !ge(sm, 100) then + // Family-specific variants, compatible within same family (e.g. sm_100f = 1002) + if !ge(sm, 100) then { def SM#sm#f : FeatureSM<""#sm#"f", !add(!mul(sm, 10), 2)>; + def : Proc<!cast<FeatureSM>("SM"#sm#"f")>; + } - // Architecture-specific targets which are incompatible across architectures - // (e.g. FullSMVersion for sm_100a is 1003) - if !ge(sm, 90) then + // Architecture-specific variants, incompatible across architectures (e.g. sm_100a = 1003) + if !ge(sm, 90) then { def SM#sm#a : FeatureSM<""#sm#"a", !add(!mul(sm, 10), 3)>; + def : Proc<!cast<FeatureSM>("SM"#sm#"a")>; + } } foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65, 70, 71, 72, @@ -102,55 +109,6 @@ foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65, 70, 71, 72, 90] in def PTX#version : FeaturePTX<version>; -//===----------------------------------------------------------------------===// -// NVPTX supported processors. -//===----------------------------------------------------------------------===// - -class Proc<string Name, list<SubtargetFeature> Features> - : Processor<Name, NoItineraries, Features>; - -def : Proc<"sm_20", [SM20, PTX32]>; -def : Proc<"sm_21", [SM21, PTX32]>; -def : Proc<"sm_30", [SM30]>; -def : Proc<"sm_32", [SM32, PTX40]>; -def : Proc<"sm_35", [SM35, PTX32]>; -def : Proc<"sm_37", [SM37, PTX41]>; -def : Proc<"sm_50", [SM50, PTX40]>; -def : Proc<"sm_52", [SM52, PTX41]>; -def : Proc<"sm_53", [SM53, PTX42]>; -def : Proc<"sm_60", [SM60, PTX50]>; -def : Proc<"sm_61", [SM61, PTX50]>; -def : Proc<"sm_62", [SM62, PTX50]>; -def : Proc<"sm_70", [SM70, PTX60]>; -def : Proc<"sm_72", [SM72, PTX61]>; -def : Proc<"sm_75", [SM75, PTX63]>; -def : Proc<"sm_80", [SM80, PTX70]>; -def : Proc<"sm_86", [SM86, PTX71]>; -def : Proc<"sm_87", [SM87, PTX74]>; -def : Proc<"sm_88", [SM88, PTX90]>; -def : Proc<"sm_89", [SM89, PTX78]>; -def : Proc<"sm_90", [SM90, PTX78]>; -def : Proc<"sm_90a", [SM90a, PTX80]>; -def : Proc<"sm_100", [SM100, PTX86]>; -def : Proc<"sm_100a", [SM100a, PTX86]>; -def : Proc<"sm_100f", [SM100f, PTX88]>; -def : Proc<"sm_101", [SM101, PTX86]>; -def : Proc<"sm_101a", [SM101a, PTX86]>; -def : Proc<"sm_101f", [SM101f, PTX88]>; -def : Proc<"sm_103", [SM103, PTX88]>; -def : Proc<"sm_103a", [SM103a, PTX88]>; -def : Proc<"sm_103f", [SM103f, PTX88]>; -def : Proc<"sm_110", [SM110, PTX90]>; -def : Proc<"sm_110a", [SM110a, PTX90]>; -def : Proc<"sm_110f", [SM110f, PTX90]>; -def : Proc<"sm_120", [SM120, PTX87]>; -def : Proc<"sm_120a", [SM120a, PTX87]>; -def : Proc<"sm_120f", [SM120f, PTX88]>; -def : Proc<"sm_121", [SM121, PTX88]>; -def : Proc<"sm_121a", [SM121a, PTX88]>; -def : Proc<"sm_121f", [SM121f, PTX88]>; - - def Is64Bit : Predicate<"Subtarget->getTargetTriple().getArch() == Triple::nvptx64">; def NVPTX64 : HwMode<[Is64Bit]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index 989be50d45554..bf3c78d3606bf 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -35,9 +35,87 @@ static cl::opt<bool> NoF32x2("nvptx-no-f32x2", cl::Hidden, "f32x2 instructions and registers."), cl::init(false)); +// FullSmVersion encoding helpers: SM * 10 + suffix offset +// (0 = base, 2 = 'f', 3 = 'a'). +static constexpr unsigned SM(unsigned Version) { return Version * 10; } +static constexpr unsigned SMF(unsigned Version) { return SM(Version) + 2; } +static constexpr unsigned SMA(unsigned Version) { return SM(Version) + 3; } + // Pin the vtable to this file. void NVPTXSubtarget::anchor() {} +// Returns the minimum PTX version required for a given SM target. +// This must be kept in sync with the "Supported Targets" column of the +// "PTX Release History" table in the PTX ISA documentation: +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes-ptx-release-history +// +// Note: LLVM's minimum supported PTX version is 3.2 (see FeaturePTX in +// NVPTX.td), so older SMs that supported earlier PTX versions instead use 3.2 +// as their effective minimum. +static unsigned getMinPTXVersionForSM(unsigned FullSmVersion) { + switch (FullSmVersion) { + case SM(20): + case SM(21): + case SM(30): + case SM(35): + return 32; + case SM(32): + case SM(50): + return 40; + case SM(37): + case SM(52): + return 41; + case SM(53): + return 42; + case SM(60): + case SM(61): + case SM(62): + return 50; + case SM(70): + return 60; + case SM(72): + return 61; + case SM(75): + return 63; + case SM(80): + return 70; + case SM(86): + return 71; + case SM(87): + return 74; + case SM(89): + case SM(90): + return 78; + case SMA(90): + return 80; + case SM(100): + case SMA(100): + case SM(101): + case SMA(101): + return 86; + case SM(120): + case SMA(120): + return 87; + case SMF(100): + case SMF(101): + case SM(103): + case SMF(103): + case SMA(103): + case SMF(120): + case SM(121): + case SMF(121): + case SMA(121): + return 88; + case SM(88): + case SM(110): + case SMF(110): + case SMA(110): + return 90; + default: + llvm_unreachable("Unknown SM version"); + } +} + NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { TargetName = std::string(CPU); @@ -49,9 +127,20 @@ NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU, // sm_90a, which would *not* be a subset of sm_91. SmVersion = getSmVersion(); - // Set default to PTX 6.0 (CUDA 9.0) + unsigned MinPTX = getMinPTXVersionForSM(FullSmVersion); + if (PTXVersion == 0) { - PTXVersion = 60; + // User didn't request a specific PTX version; use the minimum for this SM. + PTXVersion = MinPTX; + } else if (PTXVersion < MinPTX) { + // User explicitly requested an insufficient PTX version. + reportFatalUsageError( + formatv("PTX version {0}.{1} does not support target '{2}'. " + "Minimum required PTX version is {3}.{4}. " + "Either remove the PTX version to use the default, " + "or increase it to at least {3}.{4}.", + PTXVersion / 10, PTXVersion % 10, getTargetName(), MinPTX / 10, + MinPTX % 10)); } return *this; diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index aeface20f07f3..01f1680e9c63d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -29,6 +29,10 @@ namespace llvm { +// FullSmVersion encoding: SM * 10 + ArchSuffixOffset +// ArchSuffixOffset: 0 (base), 2 ('f'), 3 ('a') +// e.g. sm_100 -> 1000, sm_100f -> 1002, sm_100a -> 1003 + class NVPTXSubtarget : public NVPTXGenSubtargetInfo { virtual void anchor(); std::string TargetName; @@ -36,8 +40,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31 unsigned PTXVersion; - // Full SM version x.y is represented as 100*x+10*y+feature, e.g. 3.1 == 310 - // sm_90a == 901 + // FullSmVersion encoding: SM * 10 + ArchSuffixOffset + // ArchSuffixOffset: 0 (base), 2 ('f'), 3 ('a') + // e.g. sm_30 -> 300, sm_90a -> 903, sm_100f -> 1002 unsigned int FullSmVersion; // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from diff --git a/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll b/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll index 9e6beda9b64aa..c115cc546df28 100644 --- a/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll +++ b/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll @@ -19,10 +19,10 @@ ; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} ; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 --nvptx-short-ptr | %ptxas-verify -arch=sm_110f %} -; RUN: llc -o - -mcpu=sm_120a -march=nvptx64 -mattr=+ptx86 %s | FileCheck %s --check-prefixes=CHECK,CHECK-PTX-SHARED64 -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-sm_120a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | %ptxas-verify -arch=sm_120a %} -; RUN: %if ptxas-sm_120a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_120a %} +; RUN: llc -o - -mcpu=sm_120a -march=nvptx64 -mattr=+ptx87 %s | FileCheck %s --check-prefixes=CHECK,CHECK-PTX-SHARED64 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx87 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-sm_120a && ptxas-isa-8.7 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx87 | %ptxas-verify -arch=sm_120a %} +; RUN: %if ptxas-sm_120a && ptxas-isa-8.7 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx87 --nvptx-short-ptr | %ptxas-verify -arch=sm_120a %} ; RUN: llc -o - -mcpu=sm_120f -march=nvptx64 -mattr=+ptx88 %s | FileCheck %s --check-prefixes=CHECK,CHECK-PTX-SHARED64 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120f -mattr=+ptx88 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100a.ll b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll index 16bd0da8c6a0c..cbf7c114b06ca 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm100a.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | FileCheck %s -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | FileCheck %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx87 | FileCheck %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %} -; RUN: %if ptxas-sm_120a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | %ptxas-verify -arch=sm_120a %} +; RUN: %if ptxas-sm_120a && ptxas-isa-8.7 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx87 | %ptxas-verify -arch=sm_120a %} define i16 @cvt_rn_sf_e2m3x2_f32(float %f1, float %f2) { ; CHECK-LABEL: cvt_rn_sf_e2m3x2_f32( diff --git a/llvm/test/CodeGen/NVPTX/convert-sm103a.ll b/llvm/test/CodeGen/NVPTX/convert-sm103a.ll index 54b4dd88867ed..b58c8b3e7abc5 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm103a.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm103a.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx87 | FileCheck %s -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103a -mattr=+ptx87 | FileCheck %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.7 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx87 | %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-sm_103a && ptxas-isa-8.7 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_103a -mattr=+ptx87 | %ptxas-verify -arch=sm_103a %} +; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} ; F16X2 conversions diff --git a/llvm/test/CodeGen/NVPTX/f32-ex2.ll b/llvm/test/CodeGen/NVPTX/f32-ex2.ll index 97b9d35be371e..db3dd4a9e6011 100644 --- a/llvm/test/CodeGen/NVPTX/f32-ex2.ll +++ b/llvm/test/CodeGen/NVPTX/f32-ex2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %} +; RUN: llc < %s -mcpu=sm_50 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_50 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 | %ptxas-verify -arch=sm_50 %} target triple = "nvptx-nvidia-cuda" declare float @llvm.nvvm.ex2.approx.f32(float) diff --git a/llvm/test/CodeGen/NVPTX/fexp2.ll b/llvm/test/CodeGen/NVPTX/fexp2.ll index fe205aca7a278..2131014c7c4bd 100644 --- a/llvm/test/CodeGen/NVPTX/fexp2.ll +++ b/llvm/test/CodeGen/NVPTX/fexp2.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s +; RUN: llc < %s -mcpu=sm_50 | FileCheck --check-prefixes=CHECK %s ; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK-FP16 %s ; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK-BF16 %s -; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %} +; RUN: %if ptxas-sm_50 %{ llc < %s -mcpu=sm_50 | %ptxas-verify -arch=sm_50 %} ; RUN: %if ptxas-sm_75 && ptxas-isa-7.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %} ; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/flog2.ll b/llvm/test/CodeGen/NVPTX/flog2.ll index f5ae1b1f4bd5d..6c71862ac57aa 100644 --- a/llvm/test/CodeGen/NVPTX/flog2.ll +++ b/llvm/test/CodeGen/NVPTX/flog2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 -nvptx-approx-log2f32 | FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mcpu=sm_50 -mattr=+ptx32 -nvptx-approx-log2f32 | %ptxas-verify -arch=sm_50 %} +; RUN: llc < %s -mcpu=sm_50 -nvptx-approx-log2f32 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_50 %{ llc < %s -mcpu=sm_50 -nvptx-approx-log2f32 | %ptxas-verify -arch=sm_50 %} target triple = "nvptx64-nvidia-cuda" ; CHECK-LABEL: log2_test diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll index 75445d625f336..8a5e0a00a20eb 100644 --- a/llvm/test/CodeGen/NVPTX/i128.ll +++ b/llvm/test/CodeGen/NVPTX/i128.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=nvptx64-- -mcpu=sm_30 2>&1 | FileCheck %s -; RUN: %if ptxas-sm_30 %{ llc < %s -mtriple=nvptx64-- -mcpu=sm_30 | %ptxas-verify -arch=sm_30 %} +; RUN: llc < %s -mtriple=nvptx64-- -mcpu=sm_30 -mattr=+ptx60 2>&1 | FileCheck %s +; RUN: %if ptxas-sm_30 && ptxas-isa-6.0 %{ llc < %s -mtriple=nvptx64-- -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify -arch=sm_30 %} define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-LABEL: srem_i128( diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll index a7f3103e5fcbb..cdbf3c3305305 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx64 -O0 | FileCheck %s --check-prefixes=SM_52,COMMON ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx64 -O0 | FileCheck %s --check-prefixes=SM_70,COMMON -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx72 -O0 | FileCheck %s --check-prefixes=SM_90,COMMON +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -O0 | FileCheck %s --check-prefixes=SM_90,COMMON @.str = private unnamed_addr constant [12 x i8] c"__CUDA_ARCH\00" @.str1 = constant [11 x i8] c"__CUDA_FTZ\00" diff --git a/llvm/test/CodeGen/NVPTX/ptx-version-validation.ll b/llvm/test/CodeGen/NVPTX/ptx-version-validation.ll new file mode 100644 index 0000000000000..12614e3ef848f --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/ptx-version-validation.ll @@ -0,0 +1,51 @@ +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103a -mattr=+ptx90 2>&1 | FileCheck %s --check-prefix=CHECK-SM103A-HIGH +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103a 2>&1 | FileCheck %s --check-prefix=CHECK-SM103A +; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_103a -mattr=+ptx87 2>&1 | FileCheck %s --check-prefix=CHECK-SM103A-LOW +; RUN: %if ptxas-sm_103a && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_103a -mattr=+ptx90 | %ptxas-verify -arch=sm_103a %} +; RUN: %if ptxas-sm_103a %{ llc < %s -mtriple=nvptx64 -mcpu=sm_103a | %ptxas-verify -arch=sm_103a %} + +; Test that sm_120a defaults/requires PTX 8.7 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a 2>&1 | FileCheck %s --check-prefix=CHECK-SM120A +; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 2>&1 | FileCheck %s --check-prefix=CHECK-SM120A-LOW +; RUN: %if ptxas-sm_120a %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a | %ptxas-verify -arch=sm_120a %} + +; Test that sm_90a defaults/requires PTX 8.0 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90a 2>&1 | FileCheck %s --check-prefix=CHECK-SM90A +; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_90a -mattr=+ptx78 2>&1 | FileCheck %s --check-prefix=CHECK-SM90A-LOW +; RUN: %if ptxas-sm_90a %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90a | %ptxas-verify -arch=sm_90a %} + +; Test older SM targets +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 2>&1 | FileCheck %s --check-prefix=CHECK-SM80 +; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx63 2>&1 | FileCheck %s --check-prefix=CHECK-SM80-LOW +; RUN: %if ptxas-sm_80 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} + +; CHECK-SM103A-HIGH: .version 9.0 +; CHECK-SM103A-HIGH: .target sm_103a + +; CHECK-SM103A: .version 8.8 +; CHECK-SM103A: .target sm_103a + +; CHECK-SM103A-LOW: LLVM ERROR: PTX version 8.7 does not support target 'sm_103a'. +; CHECK-SM103A-LOW: Minimum required PTX version is 8.8. + +; CHECK-SM120A: .version 8.7 +; CHECK-SM120A: .target sm_120a + +; CHECK-SM120A-LOW: LLVM ERROR: PTX version 8.6 does not support target 'sm_120a'. +; CHECK-SM120A-LOW: Minimum required PTX version is 8.7. + +; CHECK-SM90A: .version 8.0 +; CHECK-SM90A: .target sm_90a + +; CHECK-SM90A-LOW: LLVM ERROR: PTX version 7.8 does not support target 'sm_90a'. +; CHECK-SM90A-LOW: Minimum required PTX version is 8.0. + +; CHECK-SM80: .version 7.0 +; CHECK-SM80: .target sm_80 + +; CHECK-SM80-LOW: LLVM ERROR: PTX version 6.3 does not support target 'sm_80'. +; CHECK-SM80-LOW: Minimum required PTX version is 7.0. + +define void @foo() { + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/rsqrt.ll b/llvm/test/CodeGen/NVPTX/rsqrt.ll index 0e19dc11021c7..b229510f9bed0 100644 --- a/llvm/test/CodeGen/NVPTX/rsqrt.ll +++ b/llvm/test/CodeGen/NVPTX/rsqrt.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 | %ptxas-verify %} +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx40 | FileCheck %s +; RUN: %if ptxas-sm_30 && ptxas-isa-4.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx40 | %ptxas-verify -arch=sm_30 %} ; CHECK-LABEL: .func{{.*}}test1 define float @test1(float %in) local_unnamed_addr { diff --git a/llvm/test/CodeGen/NVPTX/sm-version.ll b/llvm/test/CodeGen/NVPTX/sm-version.ll index c90c086e8b96c..620bfebd12037 100644 --- a/llvm/test/CodeGen/NVPTX/sm-version.ll +++ b/llvm/test/CodeGen/NVPTX/sm-version.ll @@ -76,7 +76,7 @@ ; SM20: .version 3.2 ; SM21: .version 3.2 -; SM30: .version 6.0 +; SM30: .version 3.2 ; SM32: .version 4.0 ; SM35: .version 3.2 ; SM37: .version 4.1 diff --git a/llvm/test/CodeGen/NVPTX/surf-tex.py b/llvm/test/CodeGen/NVPTX/surf-tex.py index 799ef8c56417d..dc949b879bd1b 100644 --- a/llvm/test/CodeGen/NVPTX/surf-tex.py +++ b/llvm/test/CodeGen/NVPTX/surf-tex.py @@ -1,6 +1,6 @@ # RUN: %python %s --target=cuda --tests=suld,sust,tex,tld4 --gen-list=%t.list > %t-cuda.ll -# RUN: llc -mcpu=sm_60 -mattr=+ptx43 %t-cuda.ll -verify-machineinstrs -o - | FileCheck %t-cuda.ll -# RUN: %if ptxas-sm_60 && ptxas-isa-4.3 %{ llc -mcpu=sm_60 -mattr=+ptx43 %t-cuda.ll -verify-machineinstrs -o - | %ptxas-verify -arch=sm_60 %} +# RUN: llc -mcpu=sm_60 %t-cuda.ll -verify-machineinstrs -o - | FileCheck %t-cuda.ll +# RUN: %if ptxas-sm_60 %{ llc -mcpu=sm_60 %t-cuda.ll -verify-machineinstrs -o - | %ptxas-verify -arch=sm_60 %} # We only need to run this second time for texture tests, because # there is a diff erence between unified and non-unified intrinsics. diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py deleted file mode 100644 index 121fa3d8068b1..0000000000000 --- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py +++ /dev/null @@ -1,14 +0,0 @@ -# Check all variants of instructions supported by PTX86 on SM120a -# RUN: %python %s --ptx=86 --gpu-arch=120a > %t-ptx86-sm_120a.ll -# RUN: FileCheck %t-ptx86-sm_120a.ll < %t-ptx86-sm_120a.ll \ -# RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG -# RUN: llc < %t-ptx86-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 \ -# RUN: | FileCheck %t-ptx86-sm_120a.ll -# RUN: %if ptxas-sm_120a && ptxas-isa-8.6 %{ \ -# RUN: llc < %t-ptx86-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 \ -# RUN: | %ptxas-verify -arch=sm_120a \ -# RUN: %} - -import wmma - -wmma.main() diff --git a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h index 4084d3d9ff329..ee3632ba149e5 100644 --- a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h @@ -32,7 +32,7 @@ struct GPUToNVVMPipelineOptions PassOptions::Option<std::string> cubinFeatures{ *this, "cubin-features", llvm::cl::desc("Features to use to serialize to cubin."), - llvm::cl::init("+ptx60")}; + llvm::cl::init("")}; PassOptions::Option<std::string> cubinFormat{ *this, "cubin-format", llvm::cl::desc("Compilation format to use to serialize to cubin."), diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td index 93c19f41a9c4d..3f44888dbd6a0 100644 --- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td @@ -146,7 +146,7 @@ def GpuNVVMAttachTarget: Pass<"nvvm-attach-target", ""> { /*default=*/"\"sm_75\"", "Target chip.">, Option<"features", "features", "std::string", - /*default=*/"\"+ptx60\"", + /*default=*/"\"\"", "Target features.">, Option<"optLevel", "O", "unsigned", /*default=*/"2", diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 5cd4a1af1f881..a6ffc81c68688 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -6466,7 +6466,7 @@ def NVVM_TargetAttr : NVVM_Attr<"NVVMTarget", "target", DefaultValuedParameter<"int", "2", "Optimization level to apply.">:$O, StringRefParameter<"Target triple.", "\"nvptx64-nvidia-cuda\"">:$triple, StringRefParameter<"Target chip.", "\"sm_75\"">:$chip, - StringRefParameter<"Target chip features.", "\"+ptx60\"">:$features, + StringRefParameter<"Target chip features.", "\"\"">:$features, OptionalParameter<"DictionaryAttr", "Target specific flags.">:$flags, OptionalParameter<"ArrayAttr", "Files to link to the LLVM module.">:$link, DefaultValuedParameter<"bool", "true", "Perform SM version check on Ops.">:$verifyTarget @@ -6478,13 +6478,12 @@ def NVVM_TargetAttr : NVVM_Attr<"NVVMTarget", "target", AttrBuilder<(ins CArg<"int", "2">:$optLevel, CArg<"StringRef", "\"nvptx64-nvidia-cuda\"">:$triple, CArg<"StringRef", "\"sm_75\"">:$chip, - CArg<"StringRef", "\"+ptx60\"">:$features, + CArg<"StringRef", "\"\"">:$features, CArg<"DictionaryAttr", "nullptr">:$targetFlags, CArg<"ArrayAttr", "nullptr">:$linkFiles, CArg<"bool", "true">:$verifyTarget), [{ return $_get($_ctxt, optLevel, triple, chip, features, targetFlags, linkFiles, verifyTarget); - }]> - ]; + }]>]; let skipDefaultBuilders = 1; let genVerifyDecl = 1; let extraClassDeclaration = [{ _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
