[clang] [llvm] [AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian closed https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/rampitec approved this pull request. Thanks. There are definitely at least 2 outstanding problems, but it seems there are no regressions comparing to what we have now. LGTM. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From 5b66bb22a91690078a955cea6c02b6b746b6502b Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Fri, 16 Feb 2024 15:08:35 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- .../builtins-amdgcn-dl-insts-gfx11.cl | 5 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 92 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 57 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 39 llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 15 +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 16 ++-- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 21 - .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 54 +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 16 llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 39 llvm/test/MC/AMDGPU/bf16_imm.s| 14 +++ llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt | 16 16 files changed, 369 insertions(+), 34 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl index dc7069decaaa61..7688dfa55a78e3 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl @@ -11,7 +11,10 @@ typedef unsigned short __attribute__((ext_vector_type(2))) ushort2; // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 true) // CHECK: call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %v2hA, <2 x half> %v2hB, half %hC) -// CHECK: call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, i16 %sC) +// CHECK: [[s1:%[0-9]+]] = bitcast <2 x i16> %v2ssA to <2 x bfloat> +// CHECK-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %v2ssB to <2 x bfloat> +// CHECK-NEXT: [[s3:%[0-9]+]] = bitcast i16 %sC to bfloat +// CHECK-NEXT: [[d:%[0-9]+]] = tail call bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], bfloat [[s3]]) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 true) // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpeculatable] >; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 79ad6ddf7861fc..883b30562e911b 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); } + bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); } + bool isSSrc_f16() const { return isSCSrcB16() || isLiteralImm(MVT::f16); } bool isSSrcV2F16() const { @@ -541,22 +543,40 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64); } + bool isVCSrcTBF16() const { +return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::bf16); + } + bool isVCSrcTF16() const { return
[clang] [llvm] [AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -0,0 +1,8 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck %s + +v_dot2_bf16_bf16 v5, v1, v2, 100.0 +// CHECK: v_dot2_bf16_bf16 v5, v1, v2, 0x42c8 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0xc8,0x42,0x00,0x00] + +v_dot2_bf16_bf16 v5, v1, v2, 1.0 +// CHECK: v_dot2_bf16_bf16 v5, v1, v2, 1.0 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xca,0x03] rampitec wrote: Wow! Yeah, it's another ticket. Looks like a can of worms. Add at least 'v_dot2_bf16_bf16 v2, v0, 1.0, v2', this one works. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -0,0 +1,8 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck %s + +v_dot2_bf16_bf16 v5, v1, v2, 100.0 +// CHECK: v_dot2_bf16_bf16 v5, v1, v2, 0x42c8 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0xc8,0x42,0x00,0x00] + +v_dot2_bf16_bf16 v5, v1, v2, 1.0 +// CHECK: v_dot2_bf16_bf16 v5, v1, v2, 1.0 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xca,0x03] shiltian wrote: It looks like our assembler doesn't like the case such as `v_dot2_bf16_bf16 v5, v1, 0x42c842c8, 0x42c8`, even w/o this patch. Even the asm (`v_dot2_bf16_bf16 v2, s0, 0x3f803f80, v2`) generated in another file above (`llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll`) can't be recognized by our assembler. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -0,0 +1,8 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck %s + +v_dot2_bf16_bf16 v5, v1, v2, 100.0 +// CHECK: v_dot2_bf16_bf16 v5, v1, v2, 0x42c8 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0xc8,0x42,0x00,0x00] + +v_dot2_bf16_bf16 v5, v1, v2, 1.0 +// CHECK: v_dot2_bf16_bf16 v5, v1, v2, 1.0 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xca,0x03] rampitec wrote: Can you add couple more tests here? The same instruction, but with the immediate in place of v2bf16 operand, so we test that code path too. Both immediate and inline literal. Here and in disasm. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -2652,6 +2652,23 @@ bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) { (Val == 0x3e22f983 && HasInv2Pi); } +bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi) { + if (!HasInv2Pi) +return false; rampitec wrote: It does not change the behavior, but generally it shall only matter when you compare value to 0x3E22. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -157,6 +157,27 @@ static uint32_t getLit16Encoding(uint16_t Val, const MCSubtargetInfo ) { return 255; } +static uint32_t getLitBF16Encoding(uint16_t Val) { + uint16_t IntImm = getIntInlineImmEncoding(static_cast(Val)); + if (IntImm != 0) +return IntImm; + + // clang-format off + switch (Val) { jayfoad wrote: Yeah, I really don't like having 4 different copies of this list of hex values (0x3f00, 0xbf00...). https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -157,6 +157,27 @@ static uint32_t getLit16Encoding(uint16_t Val, const MCSubtargetInfo ) { return 255; } +static uint32_t getLitBF16Encoding(uint16_t Val) { + uint16_t IntImm = getIntInlineImmEncoding(static_cast(Val)); + if (IntImm != 0) +return IntImm; + + // clang-format off + switch (Val) { shiltian wrote: In theory, yes, but for now we can't because `getInlineEncodingV2BF16` can't handle some cases (that I didn't dig yet). It looks like in the conversion between `uint16_t` and `uint32_t` that makes some test cases fail. IMO we need to unify them (not only for 16-bit) in one place instead of having almost the same logic at least in three places. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -157,6 +157,27 @@ static uint32_t getLit16Encoding(uint16_t Val, const MCSubtargetInfo ) { return 255; } +static uint32_t getLitBF16Encoding(uint16_t Val) { + uint16_t IntImm = getIntInlineImmEncoding(static_cast(Val)); + if (IntImm != 0) +return IntImm; + + // clang-format off + switch (Val) { jayfoad wrote: Can this call `getInlineEncodingV2BF16`? https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From d95e99ebcefa76ba2e8068f663be86340c14ab5b Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Fri, 16 Feb 2024 11:29:47 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- .../builtins-amdgcn-dl-insts-gfx11.cl | 5 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 92 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 57 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 39 llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 15 +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 16 ++-- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 21 - .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 54 +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 16 llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 39 llvm/test/MC/AMDGPU/bf16_imm.s| 8 ++ llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt | 9 ++ 16 files changed, 356 insertions(+), 34 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl index dc7069decaaa61..7688dfa55a78e3 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl @@ -11,7 +11,10 @@ typedef unsigned short __attribute__((ext_vector_type(2))) ushort2; // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 true) // CHECK: call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %v2hA, <2 x half> %v2hB, half %hC) -// CHECK: call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, i16 %sC) +// CHECK: [[s1:%[0-9]+]] = bitcast <2 x i16> %v2ssA to <2 x bfloat> +// CHECK-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %v2ssB to <2 x bfloat> +// CHECK-NEXT: [[s3:%[0-9]+]] = bitcast i16 %sC to bfloat +// CHECK-NEXT: [[d:%[0-9]+]] = tail call bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], bfloat [[s3]]) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 true) // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpeculatable] >; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 79ad6ddf7861fc..883b30562e911b 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); } + bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); } + bool isSSrc_f16() const { return isSCSrcB16() || isLiteralImm(MVT::f16); } bool isSSrcV2F16() const { @@ -541,22 +543,40 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64); } + bool isVCSrcTBF16() const { +return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::bf16); + } + bool isVCSrcTF16() const { return
[clang] [llvm] [AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits