[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
shiltian wrote: I'll create a ticket about the decoder. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From 9fbb1e610b0de65ae51bb90bd35146b5f927a46a Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 15 Feb 2024 19:13:44 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- .../builtins-amdgcn-dl-insts-gfx11.cl | 5 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 92 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 57 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 39 llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 15 +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 33 +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 21 - .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 54 +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 16 llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 39 llvm/test/MC/AMDGPU/bf16_imm.s| 8 ++ llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt | 9 ++ 16 files changed, 363 insertions(+), 44 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl index dc7069decaaa61..7688dfa55a78e3 100644 --- 
a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl @@ -11,7 +11,10 @@ typedef unsigned short __attribute__((ext_vector_type(2))) ushort2; // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 true) // CHECK: call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %v2hA, <2 x half> %v2hB, half %hC) -// CHECK: call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, i16 %sC) +// CHECK: [[s1:%[0-9]+]] = bitcast <2 x i16> %v2ssA to <2 x bfloat> +// CHECK-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %v2ssB to <2 x bfloat> +// CHECK-NEXT: [[s3:%[0-9]+]] = bitcast i16 %sC to bfloat +// CHECK-NEXT: [[d:%[0-9]+]] = tail call bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], bfloat [[s3]]) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 true) // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpeculatable] >; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 79ad6ddf7861fc..883b30562e911b 100644 
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); } + bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); } + bool isSSrc_f16() const { return isSCSrcB16() || isLiteralImm(MVT::f16); } bool isSSrcV2F16() const { @@ -541,22 +543,40 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64); } + bool isVCSrcTBF16() const { +return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::bf16); + } + bool isVCSrcTF16() const { return
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -0,0 +1,8 @@ +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck %s + +# CHECK: v_dot2_bf16_bf16 v5, v1, v2, 0x42c8 rampitec wrote: Add the encoding to the check lines. Currently it is broken: the encoded value is different from the decoded one. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From 784670dd98c3727d8d8aa25f865b7b299f114bf4 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Wed, 14 Feb 2024 13:11:01 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- .../builtins-amdgcn-dl-insts-gfx11.cl | 5 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 92 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 57 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 39 llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 15 +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 ++-- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 21 - .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 54 +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 16 llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 39 llvm/test/MC/AMDGPU/bf16_imm.s| 8 ++ llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt | 8 ++ 16 files changed, 379 insertions(+), 52 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl index dc7069decaaa61..7688dfa55a78e3 100644 --- 
a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl @@ -11,7 +11,10 @@ typedef unsigned short __attribute__((ext_vector_type(2))) ushort2; // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 true) // CHECK: call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %v2hA, <2 x half> %v2hB, half %hC) -// CHECK: call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, i16 %sC) +// CHECK: [[s1:%[0-9]+]] = bitcast <2 x i16> %v2ssA to <2 x bfloat> +// CHECK-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %v2ssB to <2 x bfloat> +// CHECK-NEXT: [[s3:%[0-9]+]] = bitcast i16 %sC to bfloat +// CHECK-NEXT: [[d:%[0-9]+]] = tail call bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], bfloat [[s3]]) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 true) // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpeculatable] >; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 79ad6ddf7861fc..883b30562e911b 100644 
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); } + bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); } + bool isSSrc_f16() const { return isSCSrcB16() || isLiteralImm(MVT::f16); } bool isSSrcV2F16() const { @@ -541,22 +543,40 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64); } + bool isVCSrcTBF16() const { +return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::bf16); + } + bool isVCSrcTF16() const { re
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From 1488b4e54982be4d3f5bc7f35617effcab52be48 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Wed, 14 Feb 2024 09:41:00 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- clang/lib/CodeGen/CGBuiltin.cpp | 4 - .../builtins-amdgcn-dl-insts-gfx11.cl | 5 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 92 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 57 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 39 llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 15 +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 ++-- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 21 - .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 54 +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 16 llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 39 llvm/test/MC/AMDGPU/bf16_imm.s| 8 ++ llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt | 8 ++ 17 files changed, 379 insertions(+), 56 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ee0b7504769622..9bc60466d09be6 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ 
b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5912,8 +5912,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. if (PTy->isX86_AMXTy()) @@ -5943,8 +5941,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl index dc7069decaaa61..7688dfa55a78e3 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl @@ -11,7 +11,10 @@ typedef unsigned short __attribute__((ext_vector_type(2))) ushort2; // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 true) // CHECK: call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %v2hA, <2 x half> %v2hB, half %hC) -// CHECK: call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, i16 %sC) +// CHECK: [[s1:%[0-9]+]] = bitcast <2 x i16> %v2ssA to <2 x bfloat> +// CHECK-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %v2ssB to <2 x bfloat> +// CHECK-NEXT: [[s3:%[0-9]+]] = bitcast i16 %sC to bfloat +// CHECK-NEXT: [[d:%[0-9]+]] = tail call bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], bfloat [[s3]]) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> 
%v2ssB, float %fC, i1 true) // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + ll
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -0,0 +1,8 @@ +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck %s + +# CHECK: v_dot2_bf16_bf16 v5, v1, v2, 0x42c8 +0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0xc8,0x42,0x00,0x00 + +# CHECK: v_dot2_bf16_bf16 v5, v1, v2, 0x3c00 shiltian wrote: The disassembler doesn't work properly because when it sees `242` with bitwidth 16, it doesn't know whether it is bf16 or fp16. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From 7a517eeab81b45616dd7a1511380f4696304375a Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 13 Feb 2024 21:59:52 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- clang/lib/CodeGen/CGBuiltin.cpp | 4 - .../builtins-amdgcn-dl-insts-gfx11.cl | 5 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 92 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 57 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 39 llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 15 +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 ++-- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 21 - .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 54 +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 16 llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 51 -- llvm/test/MC/AMDGPU/bf16_imm.s| 8 ++ llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt | 8 ++ 17 files changed, 379 insertions(+), 68 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ee0b7504769622..9bc60466d09be6 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ 
b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5912,8 +5912,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. if (PTy->isX86_AMXTy()) @@ -5943,8 +5941,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl index dc7069decaaa61..7688dfa55a78e3 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl @@ -11,7 +11,10 @@ typedef unsigned short __attribute__((ext_vector_type(2))) ushort2; // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 true) // CHECK: call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %v2hA, <2 x half> %v2hB, half %hC) -// CHECK: call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, i16 %sC) +// CHECK: [[s1:%[0-9]+]] = bitcast <2 x i16> %v2ssA to <2 x bfloat> +// CHECK-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %v2ssB to <2 x bfloat> +// CHECK-NEXT: [[s3:%[0-9]+]] = bitcast i16 %sC to bfloat +// CHECK-NEXT: [[d:%[0-9]+]] = tail call bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], bfloat [[s3]]) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> 
%v2ssB, float %fC, i1 true) // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c +
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From 47b96d282d5416f9dd4c41013d44f8865a1a0d31 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 13 Feb 2024 21:34:44 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- clang/lib/CodeGen/CGBuiltin.cpp | 4 -- .../builtins-amdgcn-dl-insts-gfx11.cl | 5 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +-- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 71 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 57 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 7 ++ llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 15 llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 21 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 54 ++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 16 + llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 51 ++--- llvm/test/MC/AMDGPU/bf16_imm.s| 8 +++ llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt | 8 +++ 17 files changed, 326 insertions(+), 68 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ee0b7504769622..9bc60466d09be6 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ 
b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5912,8 +5912,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. if (PTy->isX86_AMXTy()) @@ -5943,8 +5941,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl index dc7069decaaa61..7688dfa55a78e3 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl @@ -11,7 +11,10 @@ typedef unsigned short __attribute__((ext_vector_type(2))) ushort2; // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 true) // CHECK: call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %v2hA, <2 x half> %v2hB, half %hC) -// CHECK: call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, i16 %sC) +// CHECK: [[s1:%[0-9]+]] = bitcast <2 x i16> %v2ssA to <2 x bfloat> +// CHECK-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %v2ssB to <2 x bfloat> +// CHECK-NEXT: [[s3:%[0-9]+]] = bitcast i16 %sC to bfloat +// CHECK-NEXT: [[d:%[0-9]+]] = tail call bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], bfloat [[s3]]) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> 
%v2ssB, float %fC, i1 true) // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -4185,9 +4185,17 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: return AMDGPU::isInlinableLiteralV2F16(Imm); + case AMDGPU::OPERAND_REG_IMM_V2BF16: + case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16: +return AMDGPU::isInlinableLiteralV2BF16(Imm); + case AMDGPU::OPERAND_REG_IMM_BF16: case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED: case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: + case AMDGPU::OPERAND_REG_INLINE_C_BF16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_BF16: shiltian wrote: Okay, I copied `isInlinableLiteralBF16` from https://github.com/llvm/llvm-project/pull/81345. Initially I thought https://github.com/llvm/llvm-project/pull/81345 would land before this one, but I went down that rabbit hole and still couldn't get it fully working. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From 22fd628687652e7f9d8795c48e9f794fdd684c45 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 13 Feb 2024 21:27:17 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- clang/lib/CodeGen/CGBuiltin.cpp | 4 -- .../builtins-amdgcn-dl-insts-gfx11.cl | 5 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +-- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 71 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 57 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 7 ++ llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 15 llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 21 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 54 ++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 16 + llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 51 ++--- llvm/test/MC/AMDGPU/bf16_imm.s| 8 +++ 16 files changed, 318 insertions(+), 68 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ee0b7504769622..9bc60466d09be6 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5912,8 +5912,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, 
unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. if (PTy->isX86_AMXTy()) @@ -5943,8 +5941,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl index dc7069decaaa61..7688dfa55a78e3 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl @@ -11,7 +11,10 @@ typedef unsigned short __attribute__((ext_vector_type(2))) ushort2; // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 true) // CHECK: call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %v2hA, <2 x half> %v2hB, half %hC) -// CHECK: call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, i16 %sC) +// CHECK: [[s1:%[0-9]+]] = bitcast <2 x i16> %v2ssA to <2 x bfloat> +// CHECK-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %v2ssB to <2 x bfloat> +// CHECK-NEXT: [[s3:%[0-9]+]] = bitcast i16 %sC to bfloat +// CHECK-NEXT: [[d:%[0-9]+]] = tail call bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], bfloat [[s3]]) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 true) // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false) diff --git 
a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, In
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -4185,9 +4185,17 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: return AMDGPU::isInlinableLiteralV2F16(Imm); + case AMDGPU::OPERAND_REG_IMM_V2BF16: + case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16: +return AMDGPU::isInlinableLiteralV2BF16(Imm); + case AMDGPU::OPERAND_REG_IMM_BF16: case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED: case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: + case AMDGPU::OPERAND_REG_INLINE_C_BF16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_BF16: rampitec wrote: But right in this place you know the actual format. So you can split F16 and BF16 code and call different functions. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From d72bf8bb9d1091ba76e17bf09b0aad9073e18caa Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 13 Feb 2024 19:02:41 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- clang/lib/CodeGen/CGBuiltin.cpp | 4 -- .../builtins-amdgcn-dl-insts-gfx11.cl | 5 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +-- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 71 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 57 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 7 ++ llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 8 +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 21 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 37 ++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 15 llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 51 ++--- llvm/test/MC/AMDGPU/bf16_imm.s| 8 +++ 16 files changed, 293 insertions(+), 68 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ee0b7504769622..9bc60466d09be6 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5912,8 +5912,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, 
unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. if (PTy->isX86_AMXTy()) @@ -5943,8 +5941,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl index dc7069decaaa61..7688dfa55a78e3 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl @@ -11,7 +11,10 @@ typedef unsigned short __attribute__((ext_vector_type(2))) ushort2; // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 true) // CHECK: call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %v2hA, <2 x half> %v2hB, half %hC) -// CHECK: call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, i16 %sC) +// CHECK: [[s1:%[0-9]+]] = bitcast <2 x i16> %v2ssA to <2 x bfloat> +// CHECK-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %v2ssB to <2 x bfloat> +// CHECK-NEXT: [[s3:%[0-9]+]] = bitcast i16 %sC to bfloat +// CHECK-NEXT: [[d:%[0-9]+]] = tail call bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], bfloat [[s3]]) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 true) // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false) diff --git 
a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpec
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -4185,9 +4185,17 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: return AMDGPU::isInlinableLiteralV2F16(Imm); + case AMDGPU::OPERAND_REG_IMM_V2BF16: + case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16: +return AMDGPU::isInlinableLiteralV2BF16(Imm); + case AMDGPU::OPERAND_REG_IMM_BF16: case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED: case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: + case AMDGPU::OPERAND_REG_INLINE_C_BF16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_BF16: shiltian wrote: Yes, `isInlinableLiteral16` can't, because it can't tell `fp16` and `bf16` apart by just looking at the value. That's the reason I tried really hard to get rid of `isInlinableLiteral16` in #81345 and favor the explicit version. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -488,6 +488,49 @@ static bool printImmediateFloat16(uint32_t Imm, const MCSubtargetInfo &STI, return true; } +static bool printImmediateBFloat16(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O) { + if (Imm == 0x3F80) +O << "1.0"; + else if (Imm == 0xBF80) +O << "-1.0"; + else if (Imm == 0x3F00) +O << "0.5"; + else if (Imm == 0xBF00) +O << "-0.5"; + else if (Imm == 0x4000) +O << "2.0"; + else if (Imm == 0xC000) +O << "-2.0"; + else if (Imm == 0x4080) +O << "4.0"; + else if (Imm == 0xC080) +O << "-4.0"; + else if (Imm == 0x3E22 && STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) +O << "0.15915494"; + else +return false; + + return true; +} + +void AMDGPUInstPrinter::printImmediateBF16(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { + int16_t SImm = static_cast<int16_t>(Imm); + if (isInlinableIntLiteral(SImm)) { +O << SImm; +return; + } + + uint16_t HImm = static_cast<uint16_t>(Imm); + if (printImmediateBFloat16(HImm, STI, O)) +return; + + uint64_t Imm16 = static_cast<uint16_t>(Imm); shiltian wrote: Yeah, but it is there to get the value promoted to `uint64_t` without any ambiguity. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r shiltian wrote: The cast will be inserted automatically in `clang/lib/CodeGen/CGBuiltin.cpp` after removing the two assertions. I reverted my change to the test case by accident. Lol https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r rampitec wrote: clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl fails. You need to insert casts to bf16 while lowering it to make it work. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -4185,9 +4185,17 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: return AMDGPU::isInlinableLiteralV2F16(Imm); + case AMDGPU::OPERAND_REG_IMM_V2BF16: + case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16: +return AMDGPU::isInlinableLiteralV2BF16(Imm); + case AMDGPU::OPERAND_REG_IMM_BF16: case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED: case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: + case AMDGPU::OPERAND_REG_INLINE_C_BF16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_BF16: rampitec wrote: It seems isInlinableLiteral16() cannot handle bf16? https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -488,6 +488,49 @@ static bool printImmediateFloat16(uint32_t Imm, const MCSubtargetInfo &STI, return true; } +static bool printImmediateBFloat16(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O) { + if (Imm == 0x3F80) +O << "1.0"; + else if (Imm == 0xBF80) +O << "-1.0"; + else if (Imm == 0x3F00) +O << "0.5"; + else if (Imm == 0xBF00) +O << "-0.5"; + else if (Imm == 0x4000) +O << "2.0"; + else if (Imm == 0xC000) +O << "-2.0"; + else if (Imm == 0x4080) +O << "4.0"; + else if (Imm == 0xC080) +O << "-4.0"; + else if (Imm == 0x3E22 && STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) +O << "0.15915494"; + else +return false; + + return true; +} + +void AMDGPUInstPrinter::printImmediateBF16(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { + int16_t SImm = static_cast<int16_t>(Imm); + if (isInlinableIntLiteral(SImm)) { +O << SImm; +return; + } + + uint16_t HImm = static_cast<uint16_t>(Imm); + if (printImmediateBFloat16(HImm, STI, O)) +return; + + uint64_t Imm16 = static_cast<uint16_t>(Imm); rampitec wrote: It's the same as HImm above. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11 -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11 rampitec wrote: Change 'RUN' to 'XUN' and add a comment instead. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -0,0 +1,8 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck %s rampitec wrote: You also need a disasm test for this. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11 -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11 shiltian wrote: This change is because of the discussion here (https://github.com/llvm/llvm-project/pull/80908/files#r1483394728). https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From bfd3170dc5e4d6e53fb98b46b37f2bf3c3ebf86d Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 13 Feb 2024 17:39:23 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- clang/lib/CodeGen/CGBuiltin.cpp | 4 -- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +-- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 71 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 59 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 7 ++ llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 8 +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 21 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 37 ++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 15 llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 49 + llvm/test/MC/AMDGPU/bf16_imm.s| 8 +++ 15 files changed, 289 insertions(+), 67 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ee0b7504769622..9bc60466d09be6 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5912,8 +5912,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } 
-assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. if (PTy->isX86_AMXTy()) @@ -5943,8 +5941,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpeculatable] >; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 79ad6ddf7861fc..09f25215beb9e5 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); } + bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); } + bool isSSrc_f16() const { return isSCSrcB16() || isLiteralImm(MVT::f16); } bool isSSrcV2F16() const { @@ -541,22 +543,40 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64); } + bool isVCSrcTBF16() const { +return 
isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::bf16); + } + bool isVCSrcTF16() const { return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::f16); } + bool isVCSrcTBF16_Lo128() const { +return isRegOrInlineNoMods(AMDGPU::VS_16_Lo128RegClassID, MVT::bf16); + } + bool isVCSrcTF16_Lo128() const { return isRegOrInlineNoMods(AMDGPU::VS_16_Lo128RegClassID, MVT::f16); } + bool isVCSrcFake16BF16_Lo128() const { +return isRegOrInlineNoMods(AMDGPU::VS_32_Lo128RegClassID, MVT::bf16); + } + bool isVCSrcFake16F16_Lo128() const { return isRegOrInlineNoMods(AMDGPU::VS_32_Lo128RegClassID, MVT:
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -79,17 +79,17 @@ define amdgpu_ps void @test_llvm_amdgcn_fdot2_bf16_bf16_sis( ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT:v_mov_b32_e32 v2, s1 ; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT:v_dot2_bf16_bf16 v2, s0, 0x10001, v2 +; GFX11-NEXT:v_dot2_bf16_bf16 v2, s0, 0x3f803f80, v2 shiltian wrote: FWIW, #81345 can solve the issue but I'm struggling with getting two test cases passed. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
shiltian wrote: The patch is in good shape now. I have made two other prerequisite patches (#81674 and #81669). I'll rebase this one once they are landed. This patch only changes one bf16 instruction with the necessary infrastructure for others. I'll update all of them once this patch is landed. However, I don't think `isInlinableLiteral16` works correctly because the encoding of the floating-point inline literals is different for fp16 and bf16, but apparently for now it can only recognize the fp16 encoding. This patch at least makes the asm printer work properly. #81345 is trying to fix it correctly, but that is unrelated to this patch. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From c556e40c13adb9d253ef7c5ebb2b46cb12969d46 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 13 Feb 2024 15:30:51 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- clang/lib/CodeGen/CGBuiltin.cpp | 4 -- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +-- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 5 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 71 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 59 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 7 ++ llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 8 +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 21 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 37 ++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 15 llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 36 +- llvm/test/MC/AMDGPU/bf16_imm.s| 8 +++ 16 files changed, 292 insertions(+), 56 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ee0b7504769622..9bc60466d09be6 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5912,8 +5912,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, 
unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. if (PTy->isX86_AMXTy()) @@ -5943,8 +5941,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpeculatable] >; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 311dd9d9739a6d..3290262816ef0a 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (U.getType()->getScalarType()->isBFloatTy() || - U.getOperand(0)->getType()->getScalarType()->isBFloatTy()) + if (Opcode != TargetOpcode::G_BITCAST && + (U.getType()->getScalarType()->isBFloatTy() || + U.getOperand(0)->getType()->getScalarType()->isBFloatTy())) return false; Register Op = getOrCreateVReg(*U.getOperand(0)); Register Res = getOrCreateVReg(U); diff --git 
a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 79ad6ddf7861fc..09f25215beb9e5 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); } + bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); } + bool isSSrc_f16() const { ret
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From df3dbb6b9c257157c4afb407e40447a25c27a2a8 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Mon, 12 Feb 2024 18:03:57 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- clang/lib/CodeGen/CGBuiltin.cpp | 4 - llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 5 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 71 ++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 59 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 7 ++ llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 8 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 22 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 74 --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 15 llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 36 - llvm/test/MC/AMDGPU/bf16_imm.s| 10 +++ 16 files changed, 323 insertions(+), 65 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index a7a410dab1a018..daf651917f2a96 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5908,8 +5908,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, 
unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. if (PTy->isX86_AMXTy()) @@ -5939,8 +5937,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpeculatable] >; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index c1d8e890a66edb..828229f3e569e3 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (U.getType()->getScalarType()->isBFloatTy() || - U.getOperand(0)->getType()->getScalarType()->isBFloatTy()) + if (Opcode != TargetOpcode::G_BITCAST && + (U.getType()->getScalarType()->isBFloatTy() || + U.getOperand(0)->getType()->getScalarType()->isBFloatTy())) return false; Register Op = getOrCreateVReg(*U.getOperand(0)); Register Res = getOrCreateVReg(U); diff --git 
a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 79ad6ddf7861fc..09f25215beb9e5 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); } + bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); } + bool isSSrc_f16() const {
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -79,17 +79,17 @@ define amdgpu_ps void @test_llvm_amdgcn_fdot2_bf16_bf16_sis( ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT:v_mov_b32_e32 v2, s1 ; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT:v_dot2_bf16_bf16 v2, s0, 0x10001, v2 +; GFX11-NEXT:v_dot2_bf16_bf16 v2, s0, 0x3f803f80, v2 rampitec wrote: Well, this is unrelated to the patch itself. We can use inline 1.0 here, but then we must use op_sel_hi to produce it in the high half. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -2660,15 +2660,34 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { return true; uint16_t Val = static_cast(Literal); - return Val == 0x3C00 || // 1.0 - Val == 0xBC00 || // -1.0 - Val == 0x3800 || // 0.5 - Val == 0xB800 || // -0.5 - Val == 0x4000 || // 2.0 - Val == 0xC000 || // -2.0 - Val == 0x4400 || // 4.0 - Val == 0xC400 || // -4.0 - Val == 0x3118; // 1/2pi + + // FP16 + if (Val == 0x3C00 || // 1.0 shiltian wrote: This function might be removed eventually in https://github.com/llvm/llvm-project/pull/81345. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -2730,6 +2749,12 @@ std::optional getInlineEncodingV2I16(uint32_t Literal) { return getInlineEncodingV216(false, Literal); } +// Encoding of the literal as an inline constant for a V_PK_*_BF16 instruction +// or nullopt. +std::optional getInlineEncodingV2BF16(uint32_t Literal) { + return getInlineEncodingV216(true, Literal); shiltian wrote: This part is still WIP along with https://github.com/llvm/llvm-project/pull/81345. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From 4196e998349d663a9a9922937cc4bedbec95fe5f Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Mon, 12 Feb 2024 13:48:39 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- clang/lib/CodeGen/CGBuiltin.cpp | 4 -- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +-- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 5 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 71 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 59 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 7 ++ llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 8 +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 22 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 48 ++--- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 15 llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 36 +- llvm/test/MC/AMDGPU/bf16_imm.s| 8 +++ 16 files changed, 295 insertions(+), 65 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index a7a410dab1a018..daf651917f2a96 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5908,8 +5908,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, 
unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. if (PTy->isX86_AMXTy()) @@ -5939,8 +5937,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpeculatable] >; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index c1d8e890a66edb..828229f3e569e3 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (U.getType()->getScalarType()->isBFloatTy() || - U.getOperand(0)->getType()->getScalarType()->isBFloatTy()) + if (Opcode != TargetOpcode::G_BITCAST && + (U.getType()->getScalarType()->isBFloatTy() || + U.getOperand(0)->getType()->getScalarType()->isBFloatTy())) return false; Register Op = getOrCreateVReg(*U.getOperand(0)); Register Res = getOrCreateVReg(U); diff --git 
a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index a94da992b33859..65d6fb587c19ca 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); } + bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); } + bool isSSrc_f16() const {
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -79,17 +79,17 @@ define amdgpu_ps void @test_llvm_amdgcn_fdot2_bf16_bf16_sis( ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT:v_mov_b32_e32 v2, s1 ; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT:v_dot2_bf16_bf16 v2, s0, 0x10001, v2 +; GFX11-NEXT:v_dot2_bf16_bf16 v2, s0, 0x3f803f80, v2 shiltian wrote: Yeah, but I tested the FP16 version `llvm.amdgcn.fdot2.f16.f16` (on trunk, without my patch), and it generates `v_dot2_f16_f16 v2, s0, 0x3c003c00, v2`. I think we generally have issues with showing inline literals. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -4181,13 +4181,20 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: return AMDGPU::isInlinableLiteralV2I16(Imm); + case AMDGPU::OPERAND_REG_IMM_V2BF16: shiltian wrote: Yeah, I made some mistakes here. Will take care of them. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/rampitec edited https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -4181,13 +4181,20 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: return AMDGPU::isInlinableLiteralV2I16(Imm); + case AMDGPU::OPERAND_REG_IMM_V2BF16: rampitec wrote: It does not seem that isInlinableLiteralV2F16() can handle bf16. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -79,17 +79,17 @@ define amdgpu_ps void @test_llvm_amdgcn_fdot2_bf16_bf16_sis( ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT:v_mov_b32_e32 v2, s1 ; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT:v_dot2_bf16_bf16 v2, s0, 0x10001, v2 +; GFX11-NEXT:v_dot2_bf16_bf16 v2, s0, 0x3f803f80, v2 rampitec wrote: This should be encoded as the inline immediate 1.0. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -521,8 +521,11 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType, if (printImmediateFloat32(Imm, STI, O)) return; break; + case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16: rampitec wrote: This does not seem right, and no tests for v2bf16 have been added. I am not sure, though, whether we have instructions that can accept this type of operand. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (U.getType()->getScalarType()->isBFloatTy() || - U.getOperand(0)->getType()->getScalarType()->isBFloatTy()) + if (Opcode != TargetOpcode::G_BITCAST && rampitec wrote: This is actually an orthogonal problem. GlobalISel is completely broken for bf16, and whatever the outcome of supporting bf16 in codegen is, we just need to be ready for some GlobalISel tests to fail and to need disabling. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -0,0 +1,8 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck %s + +v_dot2_bf16_bf16 v5, v1, v2, 100.0 +// CHECK: v_dot2_bf16_bf16 v5, v1, v2, 0x42c8 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0xc8,0x42,0x00,0x00] + +v_dot2_bf16_bf16 v5, v1, v2, 1.0 +// v_dot2_bf16_bf16 v5, v1, v2, 0x3f80 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0x80,0x3f,0x00,0x00] rampitec wrote: FYI: this should be an inline literal, i.e.: 0xd6672005 0x03ca0501 https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -5908,8 +5908,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && shiltian wrote: This change might need to go to a separate patch. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
llvmbot wrote: @llvm/pr-subscribers-llvm-globalisel Author: Shilei Tian (shiltian) Changes Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. --- Patch is 32.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/80908.diff 14 Files Affected: - (modified) clang/lib/CodeGen/CGBuiltin.cpp (-4) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+4-4) - (modified) llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp (+3-2) - (modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+66) - (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp (+10) - (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp (+7) - (modified) llvm/lib/Target/AMDGPU/SIDefines.h (+7) - (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+7) - (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+32-26) - (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.td (+21-1) - (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+7) - (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll (+18-18) - (added) llvm/test/MC/AMDGPU/bf16_imm.s (+8) ``diff diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index a7a410dab1a018..daf651917f2a96 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5908,8 +5908,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. 
if (PTy->isX86_AMXTy()) @@ -5939,8 +5937,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpeculatable] >; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index c1d8e890a66edb..828229f3e569e3 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (U.getType()->getScalarType()->isBFloatTy() || - U.getOperand(0)->getType()->getScalarType()->isBFloatTy()) + if (Opcode != TargetOpcode::G_BITCAST && + (U.getType()->getScalarType()->isBFloatTy() || + U.getOperand(0)->getType()->getScalarType()->isBFloatTy())) return false; Register Op = getOrCreateVReg(*U.getOperand(0)); Register Res = getOrCreateVReg(U); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index a94da992b33859..d6d96c251f7e30 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ 
b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); } + bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); } + bool isSSrc_f16() const { return isSCSrcB16() || isLiteralImm(MVT::f16); } bool isSSrcV2F16() const { @@ -541,22 +543,40 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64); } + bool isVCSrcTBF16() const { +return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::bf16); + } + bool isVCSrcTF16() const { return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::f16); } + b
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
llvmbot wrote: @llvm/pr-subscribers-llvm-ir Author: Shilei Tian (shiltian) Changes Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. --- Patch is 32.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/80908.diff 14 Files Affected: - (modified) clang/lib/CodeGen/CGBuiltin.cpp (-4) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+4-4) - (modified) llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp (+3-2) - (modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+66) - (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp (+10) - (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp (+7) - (modified) llvm/lib/Target/AMDGPU/SIDefines.h (+7) - (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+7) - (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+32-26) - (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.td (+21-1) - (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+7) - (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll (+18-18) - (added) llvm/test/MC/AMDGPU/bf16_imm.s (+8) ``diff diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index a7a410dab1a018..daf651917f2a96 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5908,8 +5908,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. 
if (PTy->isX86_AMXTy()) @@ -5939,8 +5937,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpeculatable] >; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index c1d8e890a66edb..828229f3e569e3 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (U.getType()->getScalarType()->isBFloatTy() || - U.getOperand(0)->getType()->getScalarType()->isBFloatTy()) + if (Opcode != TargetOpcode::G_BITCAST && + (U.getType()->getScalarType()->isBFloatTy() || + U.getOperand(0)->getType()->getScalarType()->isBFloatTy())) return false; Register Op = getOrCreateVReg(*U.getOperand(0)); Register Res = getOrCreateVReg(U); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index a94da992b33859..d6d96c251f7e30 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ 
b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); } + bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); } + bool isSSrc_f16() const { return isSCSrcB16() || isLiteralImm(MVT::f16); } bool isSSrcV2F16() const { @@ -541,22 +543,40 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64); } + bool isVCSrcTBF16() const { +return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::bf16); + } + bool isVCSrcTF16() const { return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::f16); } + bool isVC
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -0,0 +1,8 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck %s + +v_dot2_bf16_bf16 v5, v1, v2, 100.0 shiltian wrote: The two instructions are from #79369 https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian ready_for_review https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits