https://github.com/keshavvinayak01 updated https://github.com/llvm/llvm-project/pull/157748
>From 49f6936ece68e845d956efe3e04855e49955bb5b Mon Sep 17 00:00:00 2001 From: keshavvinayak01 <keshavvinayak...@gmail.com> Date: Tue, 9 Sep 2025 20:50:03 +0000 Subject: [PATCH 1/5] Added Intrinsics for smed, umed, to support ISA instructions from ROCDL Signed-off-by: keshavvinayak01 <keshavvinayak...@gmail.com> --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 8 + llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 12 ++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 4 +- .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 172 ++++++++++++++++++ llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 19 +- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 22 +++ .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 3 + mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 88 +++++++++ 9 files changed, 323 insertions(+), 9 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index e5a1422fe8778..b923c519c56cd 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -125,6 +125,8 @@ BUILTIN(__builtin_amdgcn_ds_bpermute, "iii", "nc") BUILTIN(__builtin_amdgcn_readfirstlane, "ii", "nc") BUILTIN(__builtin_amdgcn_readlane, "iii", "nc") BUILTIN(__builtin_amdgcn_fmed3f, "ffff", "nc") +BUILTIN(__builtin_amdgcn_smed3, "iiii", "nc") +BUILTIN(__builtin_amdgcn_umed3, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_ds_faddf, "ff*3fIiIiIb", "n") BUILTIN(__builtin_amdgcn_ds_fminf, "ff*3fIiIiIb", "n") BUILTIN(__builtin_amdgcn_ds_fmaxf, "ff*3fIiIiIb", "n") @@ -251,6 +253,8 @@ TARGET_BUILTIN(__builtin_amdgcn_perm, "UiUiUiUi", "nc", "gfx8-insts") //===----------------------------------------------------------------------===// TARGET_BUILTIN(__builtin_amdgcn_fmed3h, "hhhh", "nc", "gfx9-insts") +TARGET_BUILTIN(__builtin_amdgcn_smed3h, "ssss", "nc", "gfx9-insts") +TARGET_BUILTIN(__builtin_amdgcn_umed3h, "UsUsUsUs", "nc", "gfx9-insts") TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f64, "dd*1d", "t", "gfx90a-insts") TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f32, "ff*1f", "t", "atomic-fadd-rtn-insts") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 87a46287c4022..b189fd745aa2d 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -548,6 +548,14 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_fmed3h: return emitBuiltinWithOneOverloadedType<3>(*this, E, Intrinsic::amdgcn_fmed3); + case AMDGPU::BI__builtin_amdgcn_smed3: + case AMDGPU::BI__builtin_amdgcn_smed3h: + return emitBuiltinWithOneOverloadedType<3>(*this, E, + Intrinsic::amdgcn_smed3); + case AMDGPU::BI__builtin_amdgcn_umed3: + case AMDGPU::BI__builtin_amdgcn_umed3h: + return emitBuiltinWithOneOverloadedType<3>(*this, E, + Intrinsic::amdgcn_umed3); case AMDGPU::BI__builtin_amdgcn_ds_append: case AMDGPU::BI__builtin_amdgcn_ds_consume: { Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ? diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 5bbc16f2dc743..00e017040ec1e 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -531,6 +531,18 @@ def int_amdgcn_fmed3 : [IntrNoMem, IntrSpeculatable] >; +def int_amdgcn_smed3 : + DefaultAttrsIntrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable] +>; + +def int_amdgcn_umed3 : + DefaultAttrsIntrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable] +>; + def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 0c112d1787c1a..b76956b6b4d24 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -256,8 +256,8 @@ def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE2, AMDGPUcvt_f32_ubyte2>; def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>; def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUpk_i16_i32_impl>; -def : GINodeEquiv<G_AMDGPU_SMED3, AMDGPUsmed3>; -def : GINodeEquiv<G_AMDGPU_UMED3, AMDGPUumed3>; +def : GINodeEquiv<G_AMDGPU_SMED3, AMDGPUsmed3_impl>; +def : GINodeEquiv<G_AMDGPU_UMED3, AMDGPUumed3_impl>; def : GINodeEquiv<G_AMDGPU_FMED3, AMDGPUfmed3_impl>; def : GINodeEquiv<G_AMDGPU_CLAMP, AMDGPUclamp>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 4fe5d00679436..1efa5b9861188 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -60,6 +60,26 @@ static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, return maxnum(Src0, Src1); } +// Constant fold llvm.amdgcn.smed3 intrinsics for standard inputs. +static APInt smed3AMDGCN(const APInt &Src0, const APInt &Src1, const APInt &Src2) { + APInt Max3 = Src0.sgt(Src1) ? (Src0.sgt(Src2) ? Src0 : Src2) + : (Src1.sgt(Src2) ? Src1 : Src2); + + if (Max3 == Src0) return Src1.sgt(Src2) ? Src1 : Src2; + if (Max3 == Src1) return Src0.sgt(Src2) ? Src0 : Src2; + return Src0.sgt(Src1) ? Src0 : Src1; +} + +// Constant fold llvm.amdgcn.umed3 intrinsics for standard inputs. +static APInt umed3AMDGCN(const APInt &Src0, const APInt &Src1, const APInt &Src2) { + APInt Max3 = Src0.ugt(Src1) ? (Src0.ugt(Src2) ? Src0 : Src2) + : (Src1.ugt(Src2) ? Src1 : Src2); + + if (Max3 == Src0) return Src1.ugt(Src2) ? Src1 : Src2; + if (Max3 == Src1) return Src0.ugt(Src2) ? Src0 : Src2; + return Src0.ugt(Src1) ? Src0 : Src1; +} + // Check if a value can be converted to a 16-bit value without losing // precision. // The value is expected to be either a float (IsFloat = true) or an unsigned @@ -427,6 +447,36 @@ static Value *matchFPExtFromF16(Value *Arg) { return nullptr; } +/// Match an sext from i16 to i32, or a constant we can convert. +static Value *matchSExtFromI16(Value *Arg) { + Value *Src = nullptr; + ConstantInt *CInt = nullptr; + if (match(Arg, m_OneUse(m_SExt(m_Value(Src))))) { + if (Src->getType()->isIntegerTy(16)) + return Src; + } else if (match(Arg, m_ConstantInt(CInt))) { + // Check if the constant fits in i16 + if (CInt->getValue().getMinSignedBits() <= 16) + return ConstantInt::get(Type::getInt16Ty(Arg->getContext()), CInt->getValue().trunc(16)); + } + return nullptr; +} + +/// Match a zext from i16 to i32, or a constant we can convert. +static Value *matchZExtFromI16(Value *Arg) { + Value *Src = nullptr; + ConstantInt *CInt = nullptr; + if (match(Arg, m_OneUse(m_ZExt(m_Value(Src))))) { + if (Src->getType()->isIntegerTy(16)) + return Src; + } else if (match(Arg, m_ConstantInt(CInt))) { + // Check if the constant fits in i16 + if (CInt->getValue().getActiveBits() <= 16) + return ConstantInt::get(Type::getInt16Ty(Arg->getContext()), CInt->getValue().trunc(16)); + } + return nullptr; +} + // Trim all zero components from the end of the vector \p UseV and return // an appropriate bitset with known elements. static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV, @@ -1174,6 +1224,128 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { break; } + case Intrinsic::amdgcn_smed3: { + Value *Src0 = II.getArgOperand(0); + Value *Src1 = II.getArgOperand(1); + Value *Src2 = II.getArgOperand(2); + + // Propagate poison values. + for (Value *Src : {Src0, Src1, Src2}) { + if (isa<PoisonValue>(Src)) + return IC.replaceInstUsesWith(II, Src); + } + + bool Swap = false; + // Canonicalize constants to RHS operands. + // + // smed3(c0, x, c1) -> smed3(x, c0, c1) + if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { + std::swap(Src0, Src1); + Swap = true; + } + + if (isa<Constant>(Src1) && !isa<Constant>(Src2)) { + std::swap(Src1, Src2); + Swap = true; + } + + if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { + std::swap(Src0, Src1); + Swap = true; + } + + if (Swap) { + II.setArgOperand(0, Src0); + II.setArgOperand(1, Src1); + II.setArgOperand(2, Src2); + return &II; + } + + // Constant fold smed3 with constant operands. + if (const ConstantInt *C0 = dyn_cast<ConstantInt>(Src0)) { + if (const ConstantInt *C1 = dyn_cast<ConstantInt>(Src1)) { + if (const ConstantInt *C2 = dyn_cast<ConstantInt>(Src2)) { + APInt Result = smed3AMDGCN(C0->getValue(), C1->getValue(), C2->getValue()); + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result)); + } + } + } + + // Width reduction for integer extensions. + // smed3((sext X), (sext Y), (sext Z)) -> sext (smed3(X, Y, Z)) + if (Value *X = matchSExtFromI16(Src0)) { + if (Value *Y = matchSExtFromI16(Src1)) { + if (Value *Z = matchSExtFromI16(Src2)) { + Value *NewCall = IC.Builder.CreateIntrinsic( + IID, {X->getType()}, {X, Y, Z}, &II, II.getName()); + return new SExtInst(NewCall, II.getType()); + } + } + } + + break; + } + case Intrinsic::amdgcn_umed3: { + Value *Src0 = II.getArgOperand(0); + Value *Src1 = II.getArgOperand(1); + Value *Src2 = II.getArgOperand(2); + + // Propagate poison values. + for (Value *Src : {Src0, Src1, Src2}) { + if (isa<PoisonValue>(Src)) + return IC.replaceInstUsesWith(II, Src); + } + + bool Swap = false; + // Canonicalize constants to RHS operands. + // + // umed3(c0, x, c1) -> umed3(x, c0, c1) + if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { + std::swap(Src0, Src1); + Swap = true; + } + + if (isa<Constant>(Src1) && !isa<Constant>(Src2)) { + std::swap(Src1, Src2); + Swap = true; + } + + if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { + std::swap(Src0, Src1); + Swap = true; + } + + if (Swap) { + II.setArgOperand(0, Src0); + II.setArgOperand(1, Src1); + II.setArgOperand(2, Src2); + return &II; + } + + // Constant fold umed3 with constant operands. + if (const ConstantInt *C0 = dyn_cast<ConstantInt>(Src0)) { + if (const ConstantInt *C1 = dyn_cast<ConstantInt>(Src1)) { + if (const ConstantInt *C2 = dyn_cast<ConstantInt>(Src2)) { + APInt Result = umed3AMDGCN(C0->getValue(), C1->getValue(), C2->getValue()); + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result)); + } + } + } + + // Width reduction for integer extensions. + // umed3((zext X), (zext Y), (zext Z)) -> zext (umed3(X, Y, Z)) + if (Value *X = matchZExtFromI16(Src0)) { + if (Value *Y = matchZExtFromI16(Src1)) { + if (Value *Z = matchZExtFromI16(Src2)) { + Value *NewCall = IC.Builder.CreateIntrinsic( + IID, {X->getType()}, {X, Y, Z}, &II, II.getName()); + return new ZExtInst(NewCall, II.getType()); + } + } + } + + break; + } case Intrinsic::amdgcn_icmp: case Intrinsic::amdgcn_fcmp: { const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index b8fa6f3fc6867..e9680e062cffa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -334,16 +334,13 @@ def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, [] >; -def AMDGPUsmed3 : SDNode<"AMDGPUISD::SMED3", AMDGPUDTIntTernaryOp, - [] ->; - -def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, - [] ->; def AMDGPUfmed3_impl : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; +def AMDGPUsmed3_impl : SDNode<"AMDGPUISD::SMED3", AMDGPUDTIntTernaryOp, []>; + +def AMDGPUumed3_impl : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, []>; + def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2", SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>, SDTCisFP<0>, SDTCisVec<1>, @@ -448,6 +445,14 @@ def AMDGPUfmed3 : PatFrags<(ops node:$src0, node:$src1, node:$src2), [(int_amdgcn_fmed3 node:$src0, node:$src1, node:$src2), (AMDGPUfmed3_impl node:$src0, node:$src1, node:$src2)]>; +def AMDGPUsmed3 : PatFrags<(ops node:$src0, node:$src1, node:$src2), + [(int_amdgcn_smed3 node:$src0, node:$src1, node:$src2), + (AMDGPUsmed3_impl node:$src0, node:$src1, node:$src2)]>; + +def AMDGPUumed3 : PatFrags<(ops node:$src0, node:$src1, node:$src2), + [(int_amdgcn_umed3 node:$src0, node:$src1, node:$src2), + (AMDGPUumed3_impl node:$src0, node:$src1, node:$src2)]>; + def AMDGPUdiv_fixup : PatFrags<(ops node:$src0, node:$src1, node:$src2), [(int_amdgcn_div_fixup node:$src0, node:$src1, node:$src2), (AMDGPUdiv_fixup_impl node:$src0, node:$src1, node:$src2)]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index f18536cd4ab93..5b91da5ef81fb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -7797,6 +7797,28 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MI.removeOperand(1); Observer.changedInstr(MI); return true; + }` + case Intrinsic::amdgcn_smed3: { + GISelChangeObserver &Observer = Helper.Observer; + + // FIXME: This is to workaround the inability of tablegen match combiners to + // match intrinsics in patterns. + Observer.changingInstr(MI); + MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_SMED3)); + MI.removeOperand(1); + Observer.changedInstr(MI); + return true; + } + case Intrinsic::amdgcn_umed3: { + GISelChangeObserver &Observer = Helper.Observer; + + // FIXME: This is to workaround the inability of tablegen match combiners to + // match intrinsics in patterns. + Observer.changingInstr(MI); + MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_UMED3)); + MI.removeOperand(1); + Observer.changedInstr(MI); + return true; } case Intrinsic::amdgcn_readlane: case Intrinsic::amdgcn_writelane: diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 36b27bef350ed..63141d065bf65 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4136,6 +4136,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: case AMDGPU::G_AMDGPU_SMED3: + case AMDGPU::G_AMDGPU_UMED3: case AMDGPU::G_AMDGPU_FMED3: return getDefaultMappingVOP(MI); case AMDGPU::G_UMULH: @@ -4660,6 +4661,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16: case Intrinsic::amdgcn_sat_pk4_i4_i8: case Intrinsic::amdgcn_sat_pk4_u4_u8: + case Intrinsic::amdgcn_smed3: + case Intrinsic::amdgcn_umed3: case Intrinsic::amdgcn_fmed3: case Intrinsic::amdgcn_cubeid: case Intrinsic::amdgcn_cubema: diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 9fa3ec1fc4b21..5e757ae337879 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -1291,6 +1291,94 @@ def ROCDL_CvtScaleF32PkFp4F32Op : }]; } +//===----------------------------------------------------------------------===// +// MED3 operations +//===----------------------------------------------------------------------===// + +def ROCDL_Med3F16Op : ROCDL_ConcreteNonMemIntrOp<"med3.f16", [Pure], 1>, + Arguments<(ins F16:$src0, + F16:$src1, + F16:$src2)> { + let results = (outs F16:$res); + let summary = "Median of three half-precision float values"; + let assemblyFormat = [{ + $src0 `,` $src1 `,` $src2 attr-dict `:` `(` type($src0) `,` type($src1) `,` type($src2) `)` `->` type($res) + }]; + string llvmBuilder = [{ + $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_med3_f16, {$src0, $src1, $src2}); + }]; +} + +def ROCDL_Med3F32Op : ROCDL_ConcreteNonMemIntrOp<"med3.f32", [Pure], 1>, + Arguments<(ins F32:$src0, + F32:$src1, + F32:$src2)> { + let results = (outs F32:$res); + let summary = "Median of three single-precision float values"; + let assemblyFormat = [{ + $src0 `,` $src1 `,` $src2 attr-dict `:` `(` type($src0) `,` type($src1) `,` type($src2) `)` `->` type($res) + }]; + string llvmBuilder = [{ + $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_med3_f32, {$src0, $src1, $src2}); + }]; +} + +def ROCDL_Med3I16Op : ROCDL_ConcreteNonMemIntrOp<"med3.i16", [Pure], 1>, + Arguments<(ins I16:$src0, + I16:$src1, + I16:$src2)> { + let results = (outs I16:$res); + let summary = "Median of three signed 16-bit integer values"; + let assemblyFormat = [{ + $src0 `,` $src1 `,` $src2 attr-dict `:` `(` type($src0) `,` type($src1) `,` type($src2) `)` `->` type($res) + }]; + string llvmBuilder = [{ + $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_smed3, {$src0, $src1, $src2}); + }]; +} + +def ROCDL_Med3I32Op : ROCDL_ConcreteNonMemIntrOp<"med3.i32", [Pure], 1>, + Arguments<(ins I32:$src0, + I32:$src1, + I32:$src2)> { + let results = (outs I32:$res); + let summary = "Median of three signed 32-bit integer values"; + let assemblyFormat = [{ + $src0 `,` $src1 `,` $src2 attr-dict `:` `(` type($src0) `,` type($src1) `,` type($src2) `)` `->` type($res) + }]; + string llvmBuilder = [{ + $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_smed3, {$src0, $src1, $src2}); + }]; +} + +def ROCDL_Med3U16Op : ROCDL_ConcreteNonMemIntrOp<"med3.u16", [Pure], 1>, + Arguments<(ins I16:$src0, + I16:$src1, + I16:$src2)> { + let results = (outs I16:$res); + let summary = "Median of three unsigned 16-bit integer values"; + let assemblyFormat = [{ + $src0 `,` $src1 `,` $src2 attr-dict `:` `(` type($src0) `,` type($src1) `,` type($src2) `)` `->` type($res) + }]; + string llvmBuilder = [{ + $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_umed3, {$src0, $src1, $src2}); + }]; +} + +def ROCDL_Med3U32Op : ROCDL_ConcreteNonMemIntrOp<"med3.u32", [Pure], 1>, + Arguments<(ins I32:$src0, + I32:$src1, + I32:$src2)> { + let results = (outs I32:$res); + let summary = "Median of three unsigned 32-bit integer values"; + let assemblyFormat = [{ + $src0 `,` $src1 `,` $src2 attr-dict `:` `(` type($src0) `,` type($src1) `,` type($src2) `)` `->` type($res) + }]; + string llvmBuilder = [{ + $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_umed3, {$src0, $src1, $src2}); + }]; +} + //===----------------------------------------------------------------------===// // ROCDL target attribute. //===----------------------------------------------------------------------===// >From 6cb12ec71b948812f9ab0467d4d7c3c61e75f439 Mon Sep 17 00:00:00 2001 From: keshavvinayak01 <keshavvinayak...@gmail.com> Date: Wed, 10 Sep 2025 18:19:55 +0000 Subject: [PATCH 2/5] Tested succesful rocdl -> llvm rewrites for smed,umed,fmed Signed-off-by: keshavvinayak01 <keshavvinayak...@gmail.com> --- .../lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 2 +- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 1efa5b9861188..c6cb4736f95df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -456,7 +456,7 @@ static Value *matchSExtFromI16(Value *Arg) { return Src; } else if (match(Arg, m_ConstantInt(CInt))) { // Check if the constant fits in i16 - if (CInt->getValue().getMinSignedBits() <= 16) + if (CInt->getValue().getActiveBits() <= 16) return ConstantInt::get(Type::getInt16Ty(Arg->getContext()), CInt->getValue().trunc(16)); } return nullptr; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 5b91da5ef81fb..5da1e04c58bae 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -7797,7 +7797,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MI.removeOperand(1); Observer.changedInstr(MI); return true; - }` + } case Intrinsic::amdgcn_smed3: { GISelChangeObserver &Observer = Helper.Observer; diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 5e757ae337879..8762f5e2c73a3 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -1305,7 +1305,7 @@ def ROCDL_Med3F16Op : ROCDL_ConcreteNonMemIntrOp<"med3.f16", [Pure], 1>, $src0 `,` $src1 `,` $src2 attr-dict `:` `(` type($src0) `,` type($src1) `,` type($src2) `)` `->` type($res) }]; string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_med3_f16, {$src0, $src1, $src2}); + $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_fmed3, {$src0, $src1, $src2}, {moduleTranslation.convertType(op.getSrc0().getType())}); }]; } @@ -1319,7 +1319,7 @@ def ROCDL_Med3F32Op : ROCDL_ConcreteNonMemIntrOp<"med3.f32", [Pure], 1>, $src0 `,` $src1 `,` $src2 attr-dict `:` `(` type($src0) `,` type($src1) `,` type($src2) `)` `->` type($res) }]; string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_med3_f32, {$src0, $src1, $src2}); + $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_fmed3, {$src0, $src1, $src2}, {moduleTranslation.convertType(op.getSrc0().getType())}); }]; } @@ -1333,7 +1333,7 @@ def ROCDL_Med3I16Op : ROCDL_ConcreteNonMemIntrOp<"med3.i16", [Pure], 1>, $src0 `,` $src1 `,` $src2 attr-dict `:` `(` type($src0) `,` type($src1) `,` type($src2) `)` `->` type($res) }]; string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_smed3, {$src0, $src1, $src2}); + $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_smed3, {$src0, $src1, $src2}, {moduleTranslation.convertType(op.getSrc0().getType())}); }]; } @@ -1347,7 +1347,7 @@ def ROCDL_Med3I32Op : ROCDL_ConcreteNonMemIntrOp<"med3.i32", [Pure], 1>, $src0 `,` $src1 `,` $src2 attr-dict `:` `(` type($src0) `,` type($src1) `,` type($src2) `)` `->` type($res) }]; string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_smed3, {$src0, $src1, $src2}); + $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_smed3, {$src0, $src1, $src2}, {moduleTranslation.convertType(op.getSrc0().getType())}); }]; } @@ -1361,7 +1361,7 @@ def ROCDL_Med3U16Op : ROCDL_ConcreteNonMemIntrOp<"med3.u16", [Pure], 1>, $src0 `,` $src1 `,` $src2 attr-dict `:` `(` type($src0) `,` type($src1) `,` type($src2) `)` `->` type($res) }]; string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_umed3, {$src0, $src1, $src2}); + $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_umed3, {$src0, $src1, $src2}, {moduleTranslation.convertType(op.getSrc0().getType())}); }]; } @@ -1375,7 +1375,7 @@ def ROCDL_Med3U32Op : ROCDL_ConcreteNonMemIntrOp<"med3.u32", [Pure], 1>, $src0 `,` $src1 `,` $src2 attr-dict `:` `(` type($src0) `,` type($src1) `,` type($src2) `)` `->` type($res) }]; string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_umed3, {$src0, $src1, $src2}); + $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_umed3, {$src0, $src1, $src2}, {moduleTranslation.convertType(op.getSrc0().getType())}); }]; } >From 8267484a38a9d4ce5520adfb16dbb8d25d9aa77c Mon Sep 17 00:00:00 2001 From: keshavvinayak01 <keshavvinayak...@gmail.com> Date: Thu, 11 Sep 2025 11:56:06 +0000 Subject: [PATCH 3/5] Added smed3, umed3 amdgcn.& -> asm lits; Added rocdl -> llvm lowering lit Signed-off-by: keshavvinayak01 <keshavvinayak...@gmail.com> --- .../CodeGen/AMDGPU/llvm.amdgcn.smed3.i16.ll | 27 ++++++++++++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.ll | 42 +++++++++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.umed3.i16.ll | 27 ++++++++++++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.umed3.ll | 42 +++++++++++++++++++ mlir/test/Target/LLVMIR/rocdl.mlir | 42 +++++++++++++++++++ 5 files changed, 180 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.i16.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.umed3.i16.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.umed3.ll diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.i16.ll new file mode 100644 index 0000000000000..0f6f00309401c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.i16.ll @@ -0,0 +1,27 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}test_smed3_i16: +; GCN: v_med3_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @test_smed3_i16(ptr addrspace(1) %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 { + %src0.i16 = trunc i32 %src0.arg to i16 + %src1.i16 = trunc i32 %src1.arg to i16 + %src2.i16 = trunc i32 %src2.arg to i16 + %med3 = call i16 @llvm.amdgcn.smed3.i16(i16 %src0.i16, i16 %src1.i16, i16 %src2.i16) + store i16 %med3, ptr addrspace(1) %out + ret void +} + +; GCN-LABEL: {{^}}test_smed3_zero_i16: +; GCN: v_med3_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 0 +define amdgpu_kernel void @test_smed3_zero_i16(ptr addrspace(1) %out, i32 %src0.arg, i32 %src1.arg) #1 { + %src0.i16 = trunc i32 %src0.arg to i16 + %src1.i16 = trunc i32 %src1.arg to i16 + %med3 = call i16 @llvm.amdgcn.smed3.i16(i16 %src0.i16, i16 %src1.i16, i16 0) + store i16 %med3, ptr addrspace(1) %out + ret void +} + +declare i16 @llvm.amdgcn.smed3.i16(i16, i16, i16) #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.ll new file mode 100644 index 0000000000000..250fdc0d2d78d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.ll @@ -0,0 +1,42 @@ +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}test_smed3: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @test_smed3(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %med3 = call i32 @llvm.amdgcn.smed3.i32(i32 %src0, i32 %src1, i32 %src2) + store i32 %med3, ptr addrspace(1) %out + ret void +} + +; GCN-LABEL: {{^}}test_smed3_multi_use: +; GCN: v_med3_i32 [[MED3:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mul_lo_i32 v{{[0-9]+}}, [[MED3]], s{{[0-9]+}} +define amdgpu_kernel void @test_smed3_multi_use(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2, i32 %mul.arg) #1 { + %med3 = call i32 @llvm.amdgcn.smed3.i32(i32 %src0, i32 %src1, i32 %src2) + %med3.user = mul i32 %med3, %mul.arg + store volatile i32 %med3.user, ptr addrspace(1) %out + store volatile i32 %med3, ptr addrspace(1) %out + ret void +} + +; GCN-LABEL: {{^}}test_smed3_constants: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 42 +define amdgpu_kernel void @test_smed3_constants(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { + %med3 = call i32 @llvm.amdgcn.smed3.i32(i32 %src0, i32 %src1, i32 42) + store i32 %med3, ptr addrspace(1) %out + ret void +} + +; GCN-LABEL: {{^}}test_smed3_zero: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 0 +define amdgpu_kernel void @test_smed3_zero(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { + %med3 = call i32 @llvm.amdgcn.smed3.i32(i32 %src0, i32 %src1, i32 0) + store i32 %med3, ptr addrspace(1) %out + ret void +} + +declare i32 @llvm.amdgcn.smed3.i32(i32, i32, i32) #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.umed3.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.umed3.i16.ll new file mode 100644 index 0000000000000..d484e8a4b0804 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.umed3.i16.ll @@ -0,0 +1,27 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}test_umed3_i16: +; GCN: v_med3_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @test_umed3_i16(ptr addrspace(1) %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 { + %src0.i16 = trunc i32 %src0.arg to i16 + %src1.i16 = trunc i32 %src1.arg to i16 + %src2.i16 = trunc i32 %src2.arg to i16 + %med3 = call i16 @llvm.amdgcn.umed3.i16(i16 %src0.i16, i16 %src1.i16, i16 %src2.i16) + store i16 %med3, ptr addrspace(1) %out + ret void +} + +; GCN-LABEL: {{^}}test_umed3_zero_i16: +; GCN: v_med3_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 0 +define amdgpu_kernel void @test_umed3_zero_i16(ptr addrspace(1) %out, i32 %src0.arg, i32 %src1.arg) #1 { + %src0.i16 = trunc i32 %src0.arg to i16 + %src1.i16 = trunc i32 %src1.arg to i16 + %med3 = call i16 @llvm.amdgcn.umed3.i16(i16 %src0.i16, i16 %src1.i16, i16 0) + store i16 %med3, ptr addrspace(1) %out + ret void +} + +declare i16 @llvm.amdgcn.umed3.i16(i16, i16, i16) #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.umed3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.umed3.ll new file mode 100644 index 0000000000000..e1bec276d1fb6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.umed3.ll @@ -0,0 +1,42 @@ +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}test_umed3: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @test_umed3(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %med3 = call i32 @llvm.amdgcn.umed3.i32(i32 %src0, i32 %src1, i32 %src2) + store i32 %med3, ptr addrspace(1) %out + ret void +} + +; GCN-LABEL: {{^}}test_umed3_multi_use: +; GCN: v_med3_u32 [[MED3:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mul_lo_i32 v{{[0-9]+}}, [[MED3]], s{{[0-9]+}} +define amdgpu_kernel void @test_umed3_multi_use(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2, i32 %mul.arg) #1 { + %med3 = call i32 @llvm.amdgcn.umed3.i32(i32 %src0, i32 %src1, i32 %src2) + %med3.user = mul i32 %med3, %mul.arg + store volatile i32 %med3.user, ptr addrspace(1) %out + store volatile i32 %med3, ptr addrspace(1) %out + ret void +} + +; GCN-LABEL: {{^}}test_umed3_constants: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 42 +define amdgpu_kernel void @test_umed3_constants(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { + %med3 = call i32 @llvm.amdgcn.umed3.i32(i32 %src0, i32 %src1, i32 42) + store i32 %med3, ptr addrspace(1) %out + ret void +} + +; GCN-LABEL: {{^}}test_umed3_zero: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 0 +define amdgpu_kernel void @test_umed3_zero(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { + %med3 = call i32 @llvm.amdgcn.umed3.i32(i32 %src0, i32 %src1, i32 0) + store i32 %med3, ptr addrspace(1) %out + ret void +} + +declare i32 @llvm.amdgcn.umed3.i32(i32, i32, i32) #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index a464358250c38..8ef4707db7bcf 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -1298,6 +1298,48 @@ llvm.func @rocdl_last_use(%ptr: !llvm.ptr<1>) -> i32 { llvm.return %ret : i32 } +llvm.func @test_med3_f16(%arg0: f16, %arg1: f16, %arg2: f16) -> f16 { + // CHECK-LABEL: define half @test_med3_f16(half %0, half %1, half %2) + %0 = rocdl.med3.f16 %arg0, %arg1, %arg2 : (f16, f16, f16) -> f16 + llvm.return %0 : f16 + // CHECK: call half @llvm.amdgcn.fmed3.f16(half %0, half %1, half %2) +} + +llvm.func @test_med3_f32(%arg0: f32, %arg1: f32, %arg2: f32) -> f32 { + // CHECK-LABEL: define float @test_med3_f32(float %0, float %1, float %2) + %0 = rocdl.med3.f32 %arg0, %arg1, %arg2 : (f32, f32, f32) -> f32 + llvm.return %0 : f32 + // CHECK: call float @llvm.amdgcn.fmed3.f32(float %0, float %1, float %2) +} + +llvm.func @test_med3_i16(%arg0: i16, %arg1: i16, %arg2: i16) -> i16 { + // CHECK-LABEL: define i16 @test_med3_i16(i16 %0, i16 %1, i16 %2) + %0 = rocdl.med3.i16 %arg0, %arg1, %arg2 : (i16, i16, i16) -> i16 + llvm.return %0 : i16 + // CHECK: call i16 @llvm.amdgcn.smed3.i16(i16 %0, i16 %1, i16 %2) +} + +llvm.func @test_med3_i32(%arg0: i32, %arg1: i32, %arg2: i32) -> i32 { + // CHECK-LABEL: define i32 @test_med3_i32(i32 %0, i32 %1, i32 %2) + %0 = rocdl.med3.i32 %arg0, %arg1, %arg2 : (i32, i32, i32) -> i32 + llvm.return %0 : i32 + // CHECK: call i32 @llvm.amdgcn.smed3.i32(i32 %0, i32 %1, i32 %2) +} + +llvm.func @test_med3_u16(%arg0: i16, %arg1: i16, %arg2: i16) -> i16 { + // CHECK-LABEL: define i16 @test_med3_u16(i16 %0, i16 %1, i16 %2) + %0 = rocdl.med3.u16 %arg0, %arg1, %arg2 : (i16, i16, i16) -> i16 + llvm.return %0 : i16 + // CHECK: call i16 @llvm.amdgcn.umed3.i16(i16 %0, i16 %1, i16 %2) +} + +llvm.func @test_med3_u32(%arg0: i32, %arg1: i32, %arg2: i32) -> i32 { + // CHECK-LABEL: define i32 @test_med3_u32(i32 %0, i32 %1, i32 %2) + %0 = rocdl.med3.u32 %arg0, %arg1, %arg2 : (i32, i32, i32) -> i32 + llvm.return %0 : i32 + // CHECK: call i32 @llvm.amdgcn.umed3.i32(i32 %0, i32 %1, i32 %2) +} + // CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" } // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024" // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128" >From 49a0f2b29ab1d04096b82b4797bc676483389ae9 Mon Sep 17 00:00:00 2001 From: keshavvinayak01 <keshavvinayak...@gmail.com> Date: Mon, 15 Sep 2025 07:24:00 +0000 Subject: [PATCH 4/5] Removed smed3/umed3 intrinsics; only keeping rocdl.fmed3. Signed-off-by: keshavvinayak01 <keshavvinayak...@gmail.com> --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 - clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 8 - llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 12 -- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 2 - .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 172 ------------------ llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 12 -- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 22 --- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 3 - .../CodeGen/AMDGPU/llvm.amdgcn.smed3.i16.ll | 27 --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.ll | 42 ----- .../CodeGen/AMDGPU/llvm.amdgcn.umed3.i16.ll | 27 --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.umed3.ll | 42 ----- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 56 ------ mlir/test/Target/LLVMIR/rocdl.mlir | 28 --- 14 files changed, 457 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.i16.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.umed3.i16.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.umed3.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 4f6ab2a36cd85..fda16e42d2c6b 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -139,8 +139,6 @@ BUILTIN(__builtin_amdgcn_ds_bpermute, "iii", "nc") BUILTIN(__builtin_amdgcn_readfirstlane, "ii", "nc") BUILTIN(__builtin_amdgcn_readlane, "iii", "nc") BUILTIN(__builtin_amdgcn_fmed3f, "ffff", "nc") -BUILTIN(__builtin_amdgcn_smed3, "iiii", "nc") -BUILTIN(__builtin_amdgcn_umed3, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_ds_faddf, "ff*3fIiIiIb", "n") BUILTIN(__builtin_amdgcn_ds_fminf, "ff*3fIiIiIb", "n") BUILTIN(__builtin_amdgcn_ds_fmaxf, "ff*3fIiIiIb", "n") @@ -267,8 +265,6 @@ TARGET_BUILTIN(__builtin_amdgcn_perm, "UiUiUiUi", "nc", "gfx8-insts") //===----------------------------------------------------------------------===// TARGET_BUILTIN(__builtin_amdgcn_fmed3h, "hhhh", "nc", "gfx9-insts") -TARGET_BUILTIN(__builtin_amdgcn_smed3h, "ssss", "nc", "gfx9-insts") -TARGET_BUILTIN(__builtin_amdgcn_umed3h, "UsUsUsUs", "nc", "gfx9-insts") TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f64, "dd*1d", "t", "gfx90a-insts") TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f32, "ff*1f", "t", "atomic-fadd-rtn-insts") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 5d4c980c7c63e..07cf08c54985a 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -606,14 +606,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_fmed3h: return emitBuiltinWithOneOverloadedType<3>(*this, E, Intrinsic::amdgcn_fmed3); - case AMDGPU::BI__builtin_amdgcn_smed3: - case AMDGPU::BI__builtin_amdgcn_smed3h: - return emitBuiltinWithOneOverloadedType<3>(*this, E, - Intrinsic::amdgcn_smed3); - case AMDGPU::BI__builtin_amdgcn_umed3: - case AMDGPU::BI__builtin_amdgcn_umed3h: - return emitBuiltinWithOneOverloadedType<3>(*this, E, - Intrinsic::amdgcn_umed3); case AMDGPU::BI__builtin_amdgcn_ds_append: case AMDGPU::BI__builtin_amdgcn_ds_consume: { Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ? diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 52ba06ed4be25..030d01d7a5f3f 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -543,18 +543,6 @@ def int_amdgcn_fmed3 : [IntrNoMem, IntrSpeculatable] >; -def int_amdgcn_smed3 : - DefaultAttrsIntrinsic<[llvm_anyint_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable] ->; - -def int_amdgcn_umed3 : - DefaultAttrsIntrinsic<[llvm_anyint_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable] ->; - def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 486ec90edcaef..fceda65d5bf27 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -256,8 +256,6 @@ def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE2, AMDGPUcvt_f32_ubyte2>; def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>; def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUpk_i16_i32_impl>; -def : GINodeEquiv<G_AMDGPU_SMED3, AMDGPUsmed3_impl>; -def : GINodeEquiv<G_AMDGPU_UMED3, AMDGPUumed3_impl>; def : GINodeEquiv<G_AMDGPU_FMED3, AMDGPUfmed3_impl>; def : GINodeEquiv<G_AMDGPU_CLAMP, AMDGPUclamp>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index c6cb4736f95df..4fe5d00679436 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -60,26 +60,6 @@ static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, return maxnum(Src0, Src1); } -// Constant fold llvm.amdgcn.smed3 intrinsics for standard inputs. -static APInt smed3AMDGCN(const APInt &Src0, const APInt &Src1, const APInt &Src2) { - APInt Max3 = Src0.sgt(Src1) ? (Src0.sgt(Src2) ? Src0 : Src2) - : (Src1.sgt(Src2) ? Src1 : Src2); - - if (Max3 == Src0) return Src1.sgt(Src2) ? Src1 : Src2; - if (Max3 == Src1) return Src0.sgt(Src2) ? Src0 : Src2; - return Src0.sgt(Src1) ? Src0 : Src1; -} - -// Constant fold llvm.amdgcn.umed3 intrinsics for standard inputs. -static APInt umed3AMDGCN(const APInt &Src0, const APInt &Src1, const APInt &Src2) { - APInt Max3 = Src0.ugt(Src1) ? (Src0.ugt(Src2) ? Src0 : Src2) - : (Src1.ugt(Src2) ? Src1 : Src2); - - if (Max3 == Src0) return Src1.ugt(Src2) ? Src1 : Src2; - if (Max3 == Src1) return Src0.ugt(Src2) ? Src0 : Src2; - return Src0.ugt(Src1) ? Src0 : Src1; -} - // Check if a value can be converted to a 16-bit value without losing // precision. // The value is expected to be either a float (IsFloat = true) or an unsigned @@ -447,36 +427,6 @@ static Value *matchFPExtFromF16(Value *Arg) { return nullptr; } -/// Match an sext from i16 to i32, or a constant we can convert. -static Value *matchSExtFromI16(Value *Arg) { - Value *Src = nullptr; - ConstantInt *CInt = nullptr; - if (match(Arg, m_OneUse(m_SExt(m_Value(Src))))) { - if (Src->getType()->isIntegerTy(16)) - return Src; - } else if (match(Arg, m_ConstantInt(CInt))) { - // Check if the constant fits in i16 - if (CInt->getValue().getActiveBits() <= 16) - return ConstantInt::get(Type::getInt16Ty(Arg->getContext()), CInt->getValue().trunc(16)); - } - return nullptr; -} - -/// Match a zext from i16 to i32, or a constant we can convert. -static Value *matchZExtFromI16(Value *Arg) { - Value *Src = nullptr; - ConstantInt *CInt = nullptr; - if (match(Arg, m_OneUse(m_ZExt(m_Value(Src))))) { - if (Src->getType()->isIntegerTy(16)) - return Src; - } else if (match(Arg, m_ConstantInt(CInt))) { - // Check if the constant fits in i16 - if (CInt->getValue().getActiveBits() <= 16) - return ConstantInt::get(Type::getInt16Ty(Arg->getContext()), CInt->getValue().trunc(16)); - } - return nullptr; -} - // Trim all zero components from the end of the vector \p UseV and return // an appropriate bitset with known elements. static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV, @@ -1224,128 +1174,6 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { break; } - case Intrinsic::amdgcn_smed3: { - Value *Src0 = II.getArgOperand(0); - Value *Src1 = II.getArgOperand(1); - Value *Src2 = II.getArgOperand(2); - - // Propagate poison values. - for (Value *Src : {Src0, Src1, Src2}) { - if (isa<PoisonValue>(Src)) - return IC.replaceInstUsesWith(II, Src); - } - - bool Swap = false; - // Canonicalize constants to RHS operands. - // - // smed3(c0, x, c1) -> smed3(x, c0, c1) - if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { - std::swap(Src0, Src1); - Swap = true; - } - - if (isa<Constant>(Src1) && !isa<Constant>(Src2)) { - std::swap(Src1, Src2); - Swap = true; - } - - if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { - std::swap(Src0, Src1); - Swap = true; - } - - if (Swap) { - II.setArgOperand(0, Src0); - II.setArgOperand(1, Src1); - II.setArgOperand(2, Src2); - return &II; - } - - // Constant fold smed3 with constant operands. - if (const ConstantInt *C0 = dyn_cast<ConstantInt>(Src0)) { - if (const ConstantInt *C1 = dyn_cast<ConstantInt>(Src1)) { - if (const ConstantInt *C2 = dyn_cast<ConstantInt>(Src2)) { - APInt Result = smed3AMDGCN(C0->getValue(), C1->getValue(), C2->getValue()); - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result)); - } - } - } - - // Width reduction for integer extensions. - // smed3((sext X), (sext Y), (sext Z)) -> sext (smed3(X, Y, Z)) - if (Value *X = matchSExtFromI16(Src0)) { - if (Value *Y = matchSExtFromI16(Src1)) { - if (Value *Z = matchSExtFromI16(Src2)) { - Value *NewCall = IC.Builder.CreateIntrinsic( - IID, {X->getType()}, {X, Y, Z}, &II, II.getName()); - return new SExtInst(NewCall, II.getType()); - } - } - } - - break; - } - case Intrinsic::amdgcn_umed3: { - Value *Src0 = II.getArgOperand(0); - Value *Src1 = II.getArgOperand(1); - Value *Src2 = II.getArgOperand(2); - - // Propagate poison values. - for (Value *Src : {Src0, Src1, Src2}) { - if (isa<PoisonValue>(Src)) - return IC.replaceInstUsesWith(II, Src); - } - - bool Swap = false; - // Canonicalize constants to RHS operands. - // - // umed3(c0, x, c1) -> umed3(x, c0, c1) - if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { - std::swap(Src0, Src1); - Swap = true; - } - - if (isa<Constant>(Src1) && !isa<Constant>(Src2)) { - std::swap(Src1, Src2); - Swap = true; - } - - if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { - std::swap(Src0, Src1); - Swap = true; - } - - if (Swap) { - II.setArgOperand(0, Src0); - II.setArgOperand(1, Src1); - II.setArgOperand(2, Src2); - return &II; - } - - // Constant fold umed3 with constant operands. - if (const ConstantInt *C0 = dyn_cast<ConstantInt>(Src0)) { - if (const ConstantInt *C1 = dyn_cast<ConstantInt>(Src1)) { - if (const ConstantInt *C2 = dyn_cast<ConstantInt>(Src2)) { - APInt Result = umed3AMDGCN(C0->getValue(), C1->getValue(), C2->getValue()); - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result)); - } - } - } - - // Width reduction for integer extensions. - // umed3((zext X), (zext Y), (zext Z)) -> zext (umed3(X, Y, Z)) - if (Value *X = matchZExtFromI16(Src0)) { - if (Value *Y = matchZExtFromI16(Src1)) { - if (Value *Z = matchZExtFromI16(Src2)) { - Value *NewCall = IC.Builder.CreateIntrinsic( - IID, {X->getType()}, {X, Y, Z}, &II, II.getName()); - return new ZExtInst(NewCall, II.getType()); - } - } - } - - break; - } case Intrinsic::amdgcn_icmp: case Intrinsic::amdgcn_fcmp: { const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index e9680e062cffa..44825179c997b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -337,10 +337,6 @@ def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, def AMDGPUfmed3_impl : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; -def AMDGPUsmed3_impl : SDNode<"AMDGPUISD::SMED3", AMDGPUDTIntTernaryOp, []>; - -def AMDGPUumed3_impl : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, []>; - def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2", SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>, SDTCisFP<0>, SDTCisVec<1>, @@ -445,14 +441,6 @@ def AMDGPUfmed3 : PatFrags<(ops node:$src0, node:$src1, node:$src2), [(int_amdgcn_fmed3 node:$src0, node:$src1, node:$src2), (AMDGPUfmed3_impl node:$src0, node:$src1, node:$src2)]>; -def AMDGPUsmed3 : PatFrags<(ops node:$src0, node:$src1, node:$src2), - [(int_amdgcn_smed3 node:$src0, node:$src1, node:$src2), - (AMDGPUsmed3_impl node:$src0, node:$src1, node:$src2)]>; - -def AMDGPUumed3 : PatFrags<(ops node:$src0, node:$src1, node:$src2), - [(int_amdgcn_umed3 node:$src0, node:$src1, node:$src2), - (AMDGPUumed3_impl node:$src0, node:$src1, node:$src2)]>; - def AMDGPUdiv_fixup : PatFrags<(ops node:$src0, node:$src1, node:$src2), [(int_amdgcn_div_fixup node:$src0, node:$src1, node:$src2), (AMDGPUdiv_fixup_impl node:$src0, node:$src1, node:$src2)]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 5da1e04c58bae..f18536cd4ab93 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -7798,28 +7798,6 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, Observer.changedInstr(MI); return true; } - case Intrinsic::amdgcn_smed3: { - GISelChangeObserver &Observer = Helper.Observer; - - // FIXME: This is to workaround the inability of tablegen match combiners to - // match intrinsics in patterns. - Observer.changingInstr(MI); - MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_SMED3)); - MI.removeOperand(1); - Observer.changedInstr(MI); - return true; - } - case Intrinsic::amdgcn_umed3: { - GISelChangeObserver &Observer = Helper.Observer; - - // FIXME: This is to workaround the inability of tablegen match combiners to - // match intrinsics in patterns. - Observer.changingInstr(MI); - MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_UMED3)); - MI.removeOperand(1); - Observer.changedInstr(MI); - return true; - } case Intrinsic::amdgcn_readlane: case Intrinsic::amdgcn_writelane: case Intrinsic::amdgcn_readfirstlane: diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 63141d065bf65..36b27bef350ed 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4136,7 +4136,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: case AMDGPU::G_AMDGPU_SMED3: - case AMDGPU::G_AMDGPU_UMED3: case AMDGPU::G_AMDGPU_FMED3: return getDefaultMappingVOP(MI); case AMDGPU::G_UMULH: @@ -4661,8 +4660,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16: case Intrinsic::amdgcn_sat_pk4_i4_i8: case Intrinsic::amdgcn_sat_pk4_u4_u8: - case Intrinsic::amdgcn_smed3: - case Intrinsic::amdgcn_umed3: case Intrinsic::amdgcn_fmed3: case Intrinsic::amdgcn_cubeid: case Intrinsic::amdgcn_cubema: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.i16.ll deleted file mode 100644 index 0f6f00309401c..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.i16.ll +++ /dev/null @@ -1,27 +0,0 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s - -; GCN-LABEL: {{^}}test_smed3_i16: -; GCN: v_med3_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @test_smed3_i16(ptr addrspace(1) %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 { - %src0.i16 = trunc i32 %src0.arg to i16 - %src1.i16 = trunc i32 %src1.arg to i16 - %src2.i16 = trunc i32 %src2.arg to i16 - %med3 = call i16 @llvm.amdgcn.smed3.i16(i16 %src0.i16, i16 %src1.i16, i16 %src2.i16) - store i16 %med3, ptr addrspace(1) %out - ret void -} - -; GCN-LABEL: {{^}}test_smed3_zero_i16: -; GCN: v_med3_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 0 -define amdgpu_kernel void @test_smed3_zero_i16(ptr addrspace(1) %out, i32 %src0.arg, i32 %src1.arg) #1 { - %src0.i16 = trunc i32 %src0.arg to i16 - %src1.i16 = trunc i32 %src1.arg to i16 - %med3 = call i16 @llvm.amdgcn.smed3.i16(i16 %src0.i16, i16 %src1.i16, i16 0) - store i16 %med3, ptr addrspace(1) %out - ret void -} - -declare i16 @llvm.amdgcn.smed3.i16(i16, i16, i16) #0 - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.ll deleted file mode 100644 index 250fdc0d2d78d..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.ll +++ /dev/null @@ -1,42 +0,0 @@ -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s - -; GCN-LABEL: {{^}}test_smed3: -; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @test_smed3(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 { - %med3 = call i32 @llvm.amdgcn.smed3.i32(i32 %src0, i32 %src1, i32 %src2) - store i32 %med3, ptr addrspace(1) %out - ret void -} - -; GCN-LABEL: {{^}}test_smed3_multi_use: -; GCN: v_med3_i32 [[MED3:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mul_lo_i32 v{{[0-9]+}}, [[MED3]], s{{[0-9]+}} -define amdgpu_kernel void @test_smed3_multi_use(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2, i32 %mul.arg) #1 { - %med3 = call i32 @llvm.amdgcn.smed3.i32(i32 %src0, i32 %src1, i32 %src2) - %med3.user = mul i32 %med3, %mul.arg - store volatile i32 %med3.user, ptr addrspace(1) %out - store volatile i32 %med3, ptr addrspace(1) %out - ret void -} - -; GCN-LABEL: {{^}}test_smed3_constants: -; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 42 -define amdgpu_kernel void @test_smed3_constants(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { - %med3 = call i32 @llvm.amdgcn.smed3.i32(i32 %src0, i32 %src1, i32 42) - store i32 %med3, ptr addrspace(1) %out - ret void -} - -; GCN-LABEL: {{^}}test_smed3_zero: -; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 0 -define amdgpu_kernel void @test_smed3_zero(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { - %med3 = call i32 @llvm.amdgcn.smed3.i32(i32 %src0, i32 %src1, i32 0) - store i32 %med3, ptr addrspace(1) %out - ret void -} - -declare i32 @llvm.amdgcn.smed3.i32(i32, i32, i32) #0 - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.umed3.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.umed3.i16.ll deleted file mode 100644 index d484e8a4b0804..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.umed3.i16.ll +++ /dev/null @@ -1,27 +0,0 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s - -; GCN-LABEL: {{^}}test_umed3_i16: -; GCN: v_med3_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @test_umed3_i16(ptr addrspace(1) %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 { - %src0.i16 = trunc i32 %src0.arg to i16 - %src1.i16 = trunc i32 %src1.arg to i16 - %src2.i16 = trunc i32 %src2.arg to i16 - %med3 = call i16 @llvm.amdgcn.umed3.i16(i16 %src0.i16, i16 %src1.i16, i16 %src2.i16) - store i16 %med3, ptr addrspace(1) %out - ret void -} - -; GCN-LABEL: {{^}}test_umed3_zero_i16: -; GCN: v_med3_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 0 -define amdgpu_kernel void @test_umed3_zero_i16(ptr addrspace(1) %out, i32 %src0.arg, i32 %src1.arg) #1 { - %src0.i16 = trunc i32 %src0.arg to i16 - %src1.i16 = trunc i32 %src1.arg to i16 - %med3 = call i16 @llvm.amdgcn.umed3.i16(i16 %src0.i16, i16 %src1.i16, i16 0) - store i16 %med3, ptr addrspace(1) %out - ret void -} - -declare i16 @llvm.amdgcn.umed3.i16(i16, i16, i16) #0 - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.umed3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.umed3.ll deleted file mode 100644 index e1bec276d1fb6..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.umed3.ll +++ /dev/null @@ -1,42 +0,0 @@ -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s - -; GCN-LABEL: {{^}}test_umed3: -; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @test_umed3(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 { - %med3 = call i32 @llvm.amdgcn.umed3.i32(i32 %src0, i32 %src1, i32 %src2) - store i32 %med3, ptr addrspace(1) %out - ret void -} - -; GCN-LABEL: {{^}}test_umed3_multi_use: -; GCN: v_med3_u32 [[MED3:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mul_lo_i32 v{{[0-9]+}}, [[MED3]], s{{[0-9]+}} -define amdgpu_kernel void @test_umed3_multi_use(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2, i32 %mul.arg) #1 { - %med3 = call i32 @llvm.amdgcn.umed3.i32(i32 %src0, i32 %src1, i32 %src2) - %med3.user = mul i32 %med3, %mul.arg - store volatile i32 %med3.user, ptr addrspace(1) %out - store volatile i32 %med3, ptr addrspace(1) %out - ret void -} - -; GCN-LABEL: {{^}}test_umed3_constants: -; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 42 -define amdgpu_kernel void @test_umed3_constants(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { - %med3 = call i32 @llvm.amdgcn.umed3.i32(i32 %src0, i32 %src1, i32 42) - store i32 %med3, ptr addrspace(1) %out - ret void -} - -; GCN-LABEL: {{^}}test_umed3_zero: -; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 0 -define amdgpu_kernel void @test_umed3_zero(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { - %med3 = call i32 @llvm.amdgcn.umed3.i32(i32 %src0, i32 %src1, i32 0) - store i32 %med3, ptr addrspace(1) %out - ret void -} - -declare i32 @llvm.amdgcn.umed3.i32(i32, i32, i32) #0 - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 8762f5e2c73a3..e308d601b1c6b 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -1323,62 +1323,6 @@ def ROCDL_Med3F32Op : ROCDL_ConcreteNonMemIntrOp<"med3.f32", [Pure], 1>, }]; } -def ROCDL_Med3I16Op : ROCDL_ConcreteNonMemIntrOp<"med3.i16", [Pure], 1>, - Arguments<(ins I16:$src0, - I16:$src1, - I16:$src2)> { - let results = (outs I16:$res); - let summary = "Median of three signed 16-bit integer values"; - let assemblyFormat = [{ - $src0 `,` $src1 `,` $src2 attr-dict `:` `(` type($src0) `,` type($src1) `,` type($src2) `)` `->` type($res) - }]; - string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_smed3, {$src0, $src1, $src2}, {moduleTranslation.convertType(op.getSrc0().getType())}); - }]; -} - -def ROCDL_Med3I32Op : ROCDL_ConcreteNonMemIntrOp<"med3.i32", [Pure], 1>, - Arguments<(ins I32:$src0, - I32:$src1, - I32:$src2)> { - let results = (outs I32:$res); - let summary = "Median of three signed 32-bit integer values"; - let assemblyFormat = [{ - $src0 `,` $src1 `,` $src2 attr-dict `:` `(` type($src0) `,` type($src1) `,` type($src2) `)` `->` type($res) - }]; - string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_smed3, {$src0, $src1, $src2}, {moduleTranslation.convertType(op.getSrc0().getType())}); - }]; -} - -def ROCDL_Med3U16Op : ROCDL_ConcreteNonMemIntrOp<"med3.u16", [Pure], 1>, - Arguments<(ins I16:$src0, - I16:$src1, - I16:$src2)> { - let results = (outs I16:$res); - let summary = "Median of three unsigned 16-bit integer values"; - let assemblyFormat = [{ - $src0 `,` $src1 `,` $src2 attr-dict `:` `(` type($src0) `,` type($src1) `,` type($src2) `)` `->` type($res) - }]; - string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_umed3, {$src0, $src1, $src2}, {moduleTranslation.convertType(op.getSrc0().getType())}); - }]; -} - -def ROCDL_Med3U32Op : ROCDL_ConcreteNonMemIntrOp<"med3.u32", [Pure], 1>, - Arguments<(ins I32:$src0, - I32:$src1, - I32:$src2)> { - let results = (outs I32:$res); - let summary = "Median of three unsigned 32-bit integer values"; - let assemblyFormat = [{ - $src0 `,` $src1 `,` $src2 attr-dict `:` `(` type($src0) `,` type($src1) `,` type($src2) `)` `->` type($res) - }]; - string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_umed3, {$src0, $src1, $src2}, {moduleTranslation.convertType(op.getSrc0().getType())}); - }]; -} - //===----------------------------------------------------------------------===// // ROCDL target attribute. //===----------------------------------------------------------------------===// diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 8ef4707db7bcf..d4871b67c8724 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -1312,34 +1312,6 @@ llvm.func @test_med3_f32(%arg0: f32, %arg1: f32, %arg2: f32) -> f32 { // CHECK: call float @llvm.amdgcn.fmed3.f32(float %0, float %1, float %2) } -llvm.func @test_med3_i16(%arg0: i16, %arg1: i16, %arg2: i16) -> i16 { - // CHECK-LABEL: define i16 @test_med3_i16(i16 %0, i16 %1, i16 %2) - %0 = rocdl.med3.i16 %arg0, %arg1, %arg2 : (i16, i16, i16) -> i16 - llvm.return %0 : i16 - // CHECK: call i16 @llvm.amdgcn.smed3.i16(i16 %0, i16 %1, i16 %2) -} - -llvm.func @test_med3_i32(%arg0: i32, %arg1: i32, %arg2: i32) -> i32 { - // CHECK-LABEL: define i32 @test_med3_i32(i32 %0, i32 %1, i32 %2) - %0 = rocdl.med3.i32 %arg0, %arg1, %arg2 : (i32, i32, i32) -> i32 - llvm.return %0 : i32 - // CHECK: call i32 @llvm.amdgcn.smed3.i32(i32 %0, i32 %1, i32 %2) -} - -llvm.func @test_med3_u16(%arg0: i16, %arg1: i16, %arg2: i16) -> i16 { - // CHECK-LABEL: define i16 @test_med3_u16(i16 %0, i16 %1, i16 %2) - %0 = rocdl.med3.u16 %arg0, %arg1, %arg2 : (i16, i16, i16) -> i16 - llvm.return %0 : i16 - // CHECK: call i16 @llvm.amdgcn.umed3.i16(i16 %0, i16 %1, i16 %2) -} - -llvm.func @test_med3_u32(%arg0: i32, %arg1: i32, %arg2: i32) -> i32 { - // CHECK-LABEL: define i32 @test_med3_u32(i32 %0, i32 %1, i32 %2) - %0 = rocdl.med3.u32 %arg0, %arg1, %arg2 : (i32, i32, i32) -> i32 - llvm.return %0 : i32 - // CHECK: call i32 @llvm.amdgcn.umed3.i32(i32 %0, i32 %1, i32 %2) -} - // CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" } // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024" // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128" >From 86441ab3cbc74b8bbe917d9f125aaa179cdb4e66 Mon Sep 17 00:00:00 2001 From: keshavvinayak01 <keshavvinayak...@gmail.com> Date: Mon, 15 Sep 2025 07:27:15 +0000 Subject: [PATCH 5/5] Added back old InstrInfo/GISel for smed/umed Signed-off-by: keshavvinayak01 <keshavvinayak...@gmail.com> --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 2 ++ llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index fceda65d5bf27..bb4bf742fb861 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -256,6 +256,8 @@ def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE2, AMDGPUcvt_f32_ubyte2>; def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>; def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUpk_i16_i32_impl>; +def : GINodeEquiv<G_AMDGPU_SMED3, AMDGPUsmed3>; +def : GINodeEquiv<G_AMDGPU_UMED3, AMDGPUumed3>; def : GINodeEquiv<G_AMDGPU_FMED3, AMDGPUfmed3_impl>; def : GINodeEquiv<G_AMDGPU_CLAMP, AMDGPUclamp>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 44825179c997b..b8fa6f3fc6867 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -334,6 +334,13 @@ def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, [] >; +def AMDGPUsmed3 : SDNode<"AMDGPUISD::SMED3", AMDGPUDTIntTernaryOp, + [] +>; + +def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, + [] +>; def AMDGPUfmed3_impl : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits