https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/194809
>From dd23131ba0b7d2b15635cb8da330704ee791f398 Mon Sep 17 00:00:00 2001 From: Aaditya <[email protected]> Date: Tue, 28 Apr 2026 13:32:36 +0530 Subject: [PATCH 1/2] [AMDGPU] Support Wave Reduction for true-16 types - 1 Supporting true-16 versions of the reduction intrinsics. Supported Ops: `min`, `umin`, `max`, `umax`. Supports only the iterative strategy; DPP is yet to be supported. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +- llvm/lib/Target/AMDGPU/SIInstructions.td | 25 ++- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll | 152 +++++++++++------ .../CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll | 153 ++++++++++++------ .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 143 +++++++++++++--- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll | 152 +++++++++++------ 6 files changed, 445 insertions(+), 186 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 57d89a4aff836..5e377703b357e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6060,6 +6060,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, bool needsSignExtension = MI.getOpcode() == AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I16 || MI.getOpcode() == AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I16; + bool useRealTrue16 = ST.useRealTrue16Insts(); // Create virtual registers required for lowering. 
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg); @@ -6152,7 +6153,6 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1; bool hasOMod = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod) != -1; - BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), LaneValueReg) .addReg(SrcReg) @@ -6712,11 +6712,13 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); switch (MI.getOpcode()) { + case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U16_t16: case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U16: case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32); case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64); + case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I16_t16: case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I16: case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32); @@ -6729,11 +6731,13 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 
AMDGPU::V_MIN_NUM_F64_e64 : AMDGPU::V_MIN_F64_e64); + case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U16_t16: case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U16: case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32); case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64); + case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I16_t16: case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I16: case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 42a99233132cb..2d0d9ce0ea1b7 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -350,8 +350,10 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)), // clang-format off multiclass - AMDGPUWaveReducePseudoGenerator<string Op, string DataType, ValueType ty, RegisterClass RetReg, SrcRegOrImm9 Reg> { - let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, UseNamedOperandTable = 1, Uses = [EXEC] in { + AMDGPUWaveReducePseudoGenerator<string Op, string DataType, ValueType ty, RegisterClass RetReg, + SrcRegOrImm9 Reg, True16PredicateClass T16Pred = NoTrue16Predicate> { + let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, + UseNamedOperandTable = 1, Uses = [EXEC], True16Predicate = T16Pred in { def !toupper(Op) #"_PSEUDO_" #DataType : VPseudoInstSI<(outs RetReg : $sdst), (ins Reg : $src, i32imm : $strategy), @@ -361,12 +363,14 @@ multiclass // clang-format on class WaveReduceOp<string OpName, string TypeStr, ValueType Ty, - RegisterClass ReturnRegisterClass, SrcRegOrImm9 RC> { + RegisterClass ReturnRegisterClass, SrcRegOrImm9 RC, + True16PredicateClass T16Pred = NoTrue16Predicate> { string Name = OpName; string TypeString = TypeStr; ValueType VT = Ty; RegisterClass RetReg = 
ReturnRegisterClass; SrcRegOrImm9 Reg = RC; + True16PredicateClass T16Predicate = T16Pred; } // Input list : [Operation_name, @@ -404,15 +408,20 @@ defvar Operations = [ WaveReduceOp<"fsub", "F32", f32, SGPR_32, VSrc_b32>, WaveReduceOp<"fsub", "F64", f64, SGPR_64, VSrc_b64>, - WaveReduceOp<"umin", "U16", i16, SGPR_32, VSrc_b16>, - WaveReduceOp<"min", "I16", i16, SGPR_32, VSrc_b16>, - WaveReduceOp<"umax", "U16", i16, SGPR_32, VSrc_b16>, - WaveReduceOp<"max", "I16", i16, SGPR_32, VSrc_b16> + WaveReduceOp<"umin", "U16", i16, SGPR_32, VSrc_b16, NotUseRealTrue16Insts>, + WaveReduceOp<"min", "I16", i16, SGPR_32, VSrc_b16, NotUseRealTrue16Insts>, + WaveReduceOp<"umax", "U16", i16, SGPR_32, VSrc_b16, NotUseRealTrue16Insts>, + WaveReduceOp<"max", "I16", i16, SGPR_32, VSrc_b16, NotUseRealTrue16Insts>, + WaveReduceOp<"umin", "U16_t16", i16, SGPR_32, VSrcT_b16, UseRealTrue16Insts>, + WaveReduceOp<"min", "I16_t16", i16, SGPR_32, VSrcT_b16, UseRealTrue16Insts>, + WaveReduceOp<"umax", "U16_t16", i16, SGPR_32, VSrcT_b16, UseRealTrue16Insts>, + WaveReduceOp<"max", "I16_t16", i16, SGPR_32, VSrcT_b16, UseRealTrue16Insts> ]; foreach Op = Operations in { defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op.Name, Op.TypeString, - Op.VT, Op.RetReg, Op.Reg>; + Op.VT, Op.RetReg, Op.Reg, + Op.T16Predicate>; } let usesCustomInserter = 1, Defs = [VCC] in { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll index c42ab7fe702df..5ecbbe86191c7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll @@ -7,10 +7,14 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 
-new-reg-bank-select < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck 
-check-prefixes=GFX1132GISEL,GFX1132GISEL-TRUE16 %s define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) { ; GFX8DAGISEL-LABEL: uniform_value_i16: @@ -80,52 +84,100 @@ define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) { ; GFX10GISEL-NEXT: global_store_short v1, v0, s[0:1] ; GFX10GISEL-NEXT: s_endpgm ; -; GFX1164DAGISEL-LABEL: uniform_value_i16: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_clause 0x1 -; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1164DAGISEL-NEXT: s_endpgm -; -; GFX1164GISEL-LABEL: uniform_value_i16: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_clause 0x1 -; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1164GISEL-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX1164GISEL-NEXT: s_endpgm -; -; GFX1132DAGISEL-LABEL: uniform_value_i16: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_clause 0x1 -; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX1132DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1132DAGISEL-NEXT: s_endpgm -; -; GFX1132GISEL-LABEL: uniform_value_i16: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_clause 0x1 -; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; 
GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1132GISEL-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX1132GISEL-NEXT: s_endpgm +; GFX1164DAGISEL-FAKE16-LABEL: uniform_value_i16: +; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry +; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x1 +; GFX1164DAGISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1164DAGISEL-FAKE16-NEXT: s_endpgm +; +; GFX1164GISEL-FAKE16-LABEL: uniform_value_i16: +; GFX1164GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1164GISEL-FAKE16-NEXT: s_clause 0x1 +; GFX1164GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-FAKE16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1164GISEL-FAKE16-NEXT: s_endpgm +; +; GFX1132DAGISEL-FAKE16-LABEL: uniform_value_i16: +; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry +; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 +; GFX1132DAGISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1132DAGISEL-FAKE16-NEXT: s_endpgm +; +; GFX1132GISEL-FAKE16-LABEL: uniform_value_i16: 
+; GFX1132GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1132GISEL-FAKE16-NEXT: s_clause 0x1 +; GFX1132GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-FAKE16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1132GISEL-FAKE16-NEXT: s_endpgm +; +; GFX1164DAGISEL-TRUE16-LABEL: uniform_value_i16: +; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry +; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x1 +; GFX1164DAGISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX1164DAGISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1164DAGISEL-TRUE16-NEXT: s_endpgm +; +; GFX1164GISEL-TRUE16-LABEL: uniform_value_i16: +; GFX1164GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1164GISEL-TRUE16-NEXT: s_clause 0x1 +; GFX1164GISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-TRUE16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX1164GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1164GISEL-TRUE16-NEXT: s_endpgm +; +; GFX1132DAGISEL-TRUE16-LABEL: uniform_value_i16: +; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry +; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 +; GFX1132DAGISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; 
GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX1132DAGISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1132DAGISEL-TRUE16-NEXT: s_endpgm +; +; GFX1132GISEL-TRUE16-LABEL: uniform_value_i16: +; GFX1132GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1132GISEL-TRUE16-NEXT: s_clause 0x1 +; GFX1132GISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-TRUE16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX1132GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1132GISEL-TRUE16-NEXT: s_endpgm entry: %result = call i16 @llvm.amdgcn.wave.reduce.max.i16(i16 %in, i32 1) store i16 %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll index 66c19da5d5dbc..4c9a44c2edf18 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll @@ -7,11 +7,14 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s -; RUN: llc -mtriple=amdgcn -mattr=-real-true16 -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL %s -; RUN: llc -mtriple=amdgcn -mattr=-real-true16 -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck 
-check-prefixes=GFX1164GISEL %s -; RUN: llc -mtriple=amdgcn -mattr=-real-true16 -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL %s -; RUN: llc -mtriple=amdgcn -mattr=-real-true16 -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL %s - +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-TRUE16 %s define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) { ; GFX8DAGISEL-LABEL: uniform_value_i16: ; GFX8DAGISEL: ; %bb.0: ; %entry @@ -80,52 +83,100 @@ define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) { ; GFX10GISEL-NEXT: global_store_short v1, v0, s[0:1] ; GFX10GISEL-NEXT: 
s_endpgm ; -; GFX1164DAGISEL-LABEL: uniform_value_i16: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_clause 0x1 -; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1164DAGISEL-NEXT: s_endpgm -; -; GFX1164GISEL-LABEL: uniform_value_i16: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_clause 0x1 -; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1164GISEL-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX1164GISEL-NEXT: s_endpgm -; -; GFX1132DAGISEL-LABEL: uniform_value_i16: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_clause 0x1 -; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX1132DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1132DAGISEL-NEXT: s_endpgm -; -; GFX1132GISEL-LABEL: uniform_value_i16: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_clause 0x1 -; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1132GISEL-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX1132GISEL-NEXT: s_endpgm +; GFX1164DAGISEL-FAKE16-LABEL: 
uniform_value_i16: +; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry +; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x1 +; GFX1164DAGISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1164DAGISEL-FAKE16-NEXT: s_endpgm +; +; GFX1164GISEL-FAKE16-LABEL: uniform_value_i16: +; GFX1164GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1164GISEL-FAKE16-NEXT: s_clause 0x1 +; GFX1164GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-FAKE16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1164GISEL-FAKE16-NEXT: s_endpgm +; +; GFX1132DAGISEL-FAKE16-LABEL: uniform_value_i16: +; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry +; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 +; GFX1132DAGISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1132DAGISEL-FAKE16-NEXT: s_endpgm +; +; GFX1132GISEL-FAKE16-LABEL: uniform_value_i16: +; GFX1132GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1132GISEL-FAKE16-NEXT: s_clause 0x1 +; GFX1132GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-FAKE16-NEXT: s_and_b32 s2, 
0xffff, s2 +; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1132GISEL-FAKE16-NEXT: s_endpgm +; +; GFX1164DAGISEL-TRUE16-LABEL: uniform_value_i16: +; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry +; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x1 +; GFX1164DAGISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX1164DAGISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1164DAGISEL-TRUE16-NEXT: s_endpgm +; +; GFX1164GISEL-TRUE16-LABEL: uniform_value_i16: +; GFX1164GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1164GISEL-TRUE16-NEXT: s_clause 0x1 +; GFX1164GISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-TRUE16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX1164GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1164GISEL-TRUE16-NEXT: s_endpgm +; +; GFX1132DAGISEL-TRUE16-LABEL: uniform_value_i16: +; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry +; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 +; GFX1132DAGISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX1132DAGISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1132DAGISEL-TRUE16-NEXT: s_endpgm +; +; GFX1132GISEL-TRUE16-LABEL: uniform_value_i16: +; GFX1132GISEL-TRUE16: ; %bb.0: ; %entry +; 
GFX1132GISEL-TRUE16-NEXT: s_clause 0x1 +; GFX1132GISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-TRUE16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX1132GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1132GISEL-TRUE16-NEXT: s_endpgm entry: %result = call i16 @llvm.amdgcn.wave.reduce.min.i16(i16 %in, i32 1) store i16 %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll index c729d1119da02..5b49959869428 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll @@ -7,10 +7,14 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s 
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-TRUE16 %s define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) { ; GFX8DAGISEL-LABEL: uniform_value_i16: @@ -80,17 +84,100 @@ define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) { ; GFX10GISEL-NEXT: global_store_short v1, v0, s[0:1] ; GFX10GISEL-NEXT: s_endpgm ; -; GFX1164DAGISEL-LABEL: uniform_value_i16: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_clause 0x1 -; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1164DAGISEL-NEXT: s_endpgm -; +; GFX1164DAGISEL-FAKE16-LABEL: uniform_value_i16: +; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry +; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x1 +; GFX1164DAGISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1164DAGISEL-FAKE16-NEXT: s_endpgm +; +; GFX1164GISEL-FAKE16-LABEL: uniform_value_i16: +; GFX1164GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1164GISEL-FAKE16-NEXT: s_clause 0x1 +; GFX1164GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-FAKE16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1164GISEL-FAKE16-NEXT: s_endpgm +; +; GFX1132DAGISEL-FAKE16-LABEL: uniform_value_i16: +; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry +; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 +; GFX1132DAGISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1132DAGISEL-FAKE16-NEXT: s_endpgm +; +; GFX1132GISEL-FAKE16-LABEL: uniform_value_i16: +; GFX1132GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1132GISEL-FAKE16-NEXT: s_clause 0x1 +; GFX1132GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-FAKE16-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-FAKE16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1132GISEL-FAKE16-NEXT: s_endpgm +; +; GFX1164DAGISEL-TRUE16-LABEL: uniform_value_i16: +; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry +; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x1 +; GFX1164DAGISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX1164DAGISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1164DAGISEL-TRUE16-NEXT: s_endpgm +; +; GFX1164GISEL-TRUE16-LABEL: uniform_value_i16: +; GFX1164GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1164GISEL-TRUE16-NEXT: s_clause 0x1 +; GFX1164GISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-TRUE16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX1164GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1164GISEL-TRUE16-NEXT: s_endpgm +; +; GFX1132DAGISEL-TRUE16-LABEL: uniform_value_i16: +; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry +; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 +; GFX1132DAGISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX1132DAGISEL-TRUE16-NEXT: 
global_store_b16 v1, v0, s[0:1] +; GFX1132DAGISEL-TRUE16-NEXT: s_endpgm +; +; GFX1132GISEL-TRUE16-LABEL: uniform_value_i16: +; GFX1132GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1132GISEL-TRUE16-NEXT: s_clause 0x1 +; GFX1132GISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-TRUE16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX1132GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1132GISEL-TRUE16-NEXT: s_endpgm ; GFX11GISEL-LABEL: uniform_value_i16: ; GFX11GISEL: ; %bb.0: ; %entry ; GFX11GISEL-NEXT: s_clause 0x1 @@ -103,16 +190,6 @@ define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) { ; GFX11GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11GISEL-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11GISEL-NEXT: s_endpgm -; -; GFX1132DAGISEL-LABEL: uniform_value_i16: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_clause 0x1 -; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX1132DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1132DAGISEL-NEXT: s_endpgm entry: %result = call i16 @llvm.amdgcn.wave.reduce.umax.i16(i16 %in, i32 1) store i16 %result, ptr addrspace(1) %out @@ -3116,10 +3193,24 @@ define amdgpu_kernel void @poison_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX10GISEL: ; %bb.0: ; %entry ; GFX10GISEL-NEXT: s_endpgm ; +; GFX1164DAGISEL-LABEL: poison_value_i64: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: poison_value_i64: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: poison_value_i64: 
+; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: poison_value_i64: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_endpgm ; GFX11DAGISEL-LABEL: poison_value_i64: ; GFX11DAGISEL: ; %bb.0: ; %entry ; GFX11DAGISEL-NEXT: s_endpgm -; ; GFX11GISEL-LABEL: poison_value_i64: ; GFX11GISEL: ; %bb.0: ; %entry ; GFX11GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll index 6204232b7e7f4..4a9fde954573c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll @@ -7,10 +7,14 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | 
FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-TRUE16 %s define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) { ; GFX8DAGISEL-LABEL: uniform_value_i16: @@ -80,52 +84,100 @@ define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) { ; GFX10GISEL-NEXT: global_store_short v1, v0, s[0:1] ; GFX10GISEL-NEXT: s_endpgm ; -; GFX1164DAGISEL-LABEL: uniform_value_i16: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_clause 0x1 -; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1164DAGISEL-NEXT: s_endpgm -; -; GFX1164GISEL-LABEL: uniform_value_i16: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_clause 0x1 -; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1164GISEL-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1164GISEL-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX1164GISEL-NEXT: s_endpgm -; -; GFX1132DAGISEL-LABEL: uniform_value_i16: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_clause 0x1 -; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX1132DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1132DAGISEL-NEXT: s_endpgm -; -; GFX1132GISEL-LABEL: uniform_value_i16: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_clause 0x1 -; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1132GISEL-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX1132GISEL-NEXT: s_endpgm +; GFX1164DAGISEL-FAKE16-LABEL: uniform_value_i16: +; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry +; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x1 +; GFX1164DAGISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1164DAGISEL-FAKE16-NEXT: s_endpgm +; +; GFX1164GISEL-FAKE16-LABEL: uniform_value_i16: +; GFX1164GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1164GISEL-FAKE16-NEXT: s_clause 0x1 +; GFX1164GISEL-FAKE16-NEXT: 
s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-FAKE16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1164GISEL-FAKE16-NEXT: s_endpgm +; +; GFX1132DAGISEL-FAKE16-LABEL: uniform_value_i16: +; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry +; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 +; GFX1132DAGISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1132DAGISEL-FAKE16-NEXT: s_endpgm +; +; GFX1132GISEL-FAKE16-LABEL: uniform_value_i16: +; GFX1132GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1132GISEL-FAKE16-NEXT: s_clause 0x1 +; GFX1132GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-FAKE16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1132GISEL-FAKE16-NEXT: s_endpgm +; +; GFX1164DAGISEL-TRUE16-LABEL: uniform_value_i16: +; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry +; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x1 +; GFX1164DAGISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; 
GFX1164DAGISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1164DAGISEL-TRUE16-NEXT: s_endpgm +; +; GFX1164GISEL-TRUE16-LABEL: uniform_value_i16: +; GFX1164GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1164GISEL-TRUE16-NEXT: s_clause 0x1 +; GFX1164GISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-TRUE16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX1164GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1164GISEL-TRUE16-NEXT: s_endpgm +; +; GFX1132DAGISEL-TRUE16-LABEL: uniform_value_i16: +; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry +; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 +; GFX1132DAGISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX1132DAGISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1132DAGISEL-TRUE16-NEXT: s_endpgm +; +; GFX1132GISEL-TRUE16-LABEL: uniform_value_i16: +; GFX1132GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1132GISEL-TRUE16-NEXT: s_clause 0x1 +; GFX1132GISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-TRUE16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX1132GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1132GISEL-TRUE16-NEXT: s_endpgm entry: %result = call i16 @llvm.amdgcn.wave.reduce.umin.i16(i16 %in, i32 1) store i16 %result, ptr addrspace(1) %out >From 
b4be9d54dbf69c6c95638ed8f076ce9673913ff7 Mon Sep 17 00:00:00 2001 From: Aaditya <[email protected]> Date: Mon, 4 May 2026 14:07:53 +0530 Subject: [PATCH 2/2] Use `REG_SEQUENCE` instead of `COPY` Use SALU opcodes for all reductions --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 38 ++- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll | 226 +++++++++++------ .../CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll | 227 ++++++++++++------ .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 226 +++++++++++------ .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll | 226 +++++++++++------ 5 files changed, 632 insertions(+), 311 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5e377703b357e..8a5107c2b0d88 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5679,8 +5679,12 @@ static uint64_t getIdentityValueForWaveReduction(unsigned Opc) { static bool is16bitWaveReduction(unsigned Opc) { return Opc == AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U16 || + Opc == AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U16_t16 || Opc == AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I16 || + Opc == AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I16_t16 || Opc == AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U16 || + Opc == AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U16_t16 || + Opc == AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I16_t16 || Opc == AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I16; } @@ -5812,17 +5816,18 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, enum WAVE_REDUCE_STRATEGY : unsigned { DEFAULT = 0, ITERATIVE = 1, DPP = 2 }; MachineBasicBlock *RetBB = nullptr; unsigned MIOpc = MI.getOpcode(); - auto BuildRegSequence = [&](MachineBasicBlock &BB, - MachineBasicBlock::iterator MI, Register Dst, - Register Src0, Register Src1) { - auto RegSequence = - BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dst) - .addReg(Src0) - .addImm(AMDGPU::sub0) - .addReg(Src1) - .addImm(AMDGPU::sub1); - return RegSequence; - }; + auto BuildRegSequence = + [&](MachineBasicBlock 
&BB, MachineBasicBlock::iterator MI, Register Dst, + Register Src0, Register Src1, unsigned SubRegIdx0 = AMDGPU::sub0, + unsigned SubRegIdx1 = AMDGPU::sub1) { + auto RegSequence = + BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dst) + .addReg(Src0) + .addImm(SubRegIdx0) + .addReg(Src1) + .addImm(SubRegIdx1); + return RegSequence; + }; if (isSGPR) { switch (Opc) { case AMDGPU::S_MIN_U32: @@ -6058,7 +6063,9 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, bool isFPOp = isFloatingPointWaveReduceOperation(Opc); bool NeedsMovDPP = !is32BitOpc; bool needsSignExtension = + MI.getOpcode() == AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I16_t16 || MI.getOpcode() == AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I16 || + MI.getOpcode() == AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I16_t16 || MI.getOpcode() == AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I16; bool useRealTrue16 = ST.useRealTrue16Insts(); // Create virtual registers required for lowering. @@ -6107,6 +6114,15 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, // reduction. Register PromotedSrc = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + if (useRealTrue16) { + Register Hi16BitsReg = MRI.createVirtualRegister(SrcRegClass); + Register SuperRegTuple = + MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(BB, I, DL, TII->get(AMDGPU::IMPLICIT_DEF), Hi16BitsReg); + BuildRegSequence(BB, I, SuperRegTuple, SrcReg, Hi16BitsReg, + AMDGPU::lo16, AMDGPU::hi16); + SrcReg = SuperRegTuple; + } BuildMI(BB, I, DL, TII->get(needsSignExtension ? 
AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64), diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll index 5ecbbe86191c7..1db3b5e02a653 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll @@ -333,81 +333,157 @@ define void @divergent_value_i16(ptr addrspace(1) %out, i16 %in) { ; GFX1032GISEL-NEXT: global_store_short v[0:1], v2, off ; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX1164DAGISEL-LABEL: divergent_value_i16: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164DAGISEL-NEXT: s_brev_b32 s2, 1 -; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX1164DAGISEL-NEXT: s_max_i32 s2, s2, s4 -; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1164DAGISEL-NEXT: ; %bb.2: -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164DAGISEL-NEXT: global_store_b16 v[0:1], v2, off -; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164GISEL-LABEL: divergent_value_i16: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164GISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164GISEL-NEXT: s_brev_b32 s2, 1 -; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX1164GISEL-NEXT: 
s_max_i32 s2, s2, s4 -; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1164GISEL-NEXT: ; %bb.2: -; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164GISEL-NEXT: global_store_b16 v[0:1], v2, off -; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132DAGISEL-LABEL: divergent_value_i16: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX1132DAGISEL-NEXT: s_brev_b32 s0, 1 -; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 -; GFX1132DAGISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX1132DAGISEL-NEXT: s_max_i32 s0, s0, s3 -; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1132DAGISEL-NEXT: ; %bb.2: -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX1132DAGISEL-NEXT: global_store_b16 v[0:1], v2, off -; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132GISEL-LABEL: divergent_value_i16: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132GISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX1132GISEL-NEXT: s_brev_b32 s0, 1 -; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 -; GFX1132GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX1132GISEL-NEXT: s_max_i32 s0, s0, s3 -; GFX1132GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1132GISEL-NEXT: ; %bb.2: -; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX1132GISEL-NEXT: global_store_b16 v[0:1], v2, off -; 
GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1164DAGISEL-FAKE16-LABEL: divergent_value_i16: +; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry +; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-FAKE16-NEXT: s_brev_b32 s2, 1 +; GFX1164DAGISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-FAKE16-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-FAKE16-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-FAKE16-NEXT: s_max_i32 s2, s2, s4 +; GFX1164DAGISEL-FAKE16-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.2: +; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164DAGISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-FAKE16-LABEL: divergent_value_i16: +; GFX1164GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1164GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX1164GISEL-FAKE16-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-FAKE16-NEXT: s_brev_b32 s2, 1 +; GFX1164GISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-FAKE16-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-FAKE16-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-FAKE16-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-FAKE16-NEXT: s_max_i32 s2, s2, s4 +; GFX1164GISEL-FAKE16-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-FAKE16-NEXT: ; %bb.2: +; GFX1164GISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164GISEL-FAKE16-NEXT: global_store_b16 v[0:1], 
v2, off +; GFX1164GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_i16: +; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry +; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s1, exec_lo +; GFX1132DAGISEL-FAKE16-NEXT: s_brev_b32 s0, 1 +; GFX1132DAGISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-FAKE16-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-FAKE16-NEXT: s_bitset0_b32 s1, s2 +; GFX1132DAGISEL-FAKE16-NEXT: s_max_i32 s0, s0, s3 +; GFX1132DAGISEL-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-FAKE16-NEXT: ; %bb.2: +; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132DAGISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-FAKE16-LABEL: divergent_value_i16: +; GFX1132GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1132GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX1132GISEL-FAKE16-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-FAKE16-NEXT: s_brev_b32 s0, 1 +; GFX1132GISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-FAKE16-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-FAKE16-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-FAKE16-NEXT: s_bitset0_b32 s1, s2 +; GFX1132GISEL-FAKE16-NEXT: s_max_i32 s0, s0, s3 +; GFX1132GISEL-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132GISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-FAKE16-NEXT: ; %bb.2: +; GFX1132GISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, 
off +; GFX1132GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164DAGISEL-TRUE16-LABEL: divergent_value_i16: +; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry +; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-TRUE16-NEXT: s_brev_b32 s2, 1 +; GFX1164DAGISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-TRUE16-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-TRUE16-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-TRUE16-NEXT: s_max_i32 s2, s2, s4 +; GFX1164DAGISEL-TRUE16-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.2: +; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX1164DAGISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-TRUE16-LABEL: divergent_value_i16: +; GFX1164GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1164GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX1164GISEL-TRUE16-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-TRUE16-NEXT: s_brev_b32 s2, 1 +; GFX1164GISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-TRUE16-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-TRUE16-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-TRUE16-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-TRUE16-NEXT: s_max_i32 s2, s2, s4 +; GFX1164GISEL-TRUE16-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-TRUE16-NEXT: ; %bb.2: +; GFX1164GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX1164GISEL-TRUE16-NEXT: 
global_store_b16 v[0:1], v2, off +; GFX1164GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_i16: +; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry +; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s1, exec_lo +; GFX1132DAGISEL-TRUE16-NEXT: s_brev_b32 s0, 1 +; GFX1132DAGISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-TRUE16-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-TRUE16-NEXT: s_bitset0_b32 s1, s2 +; GFX1132DAGISEL-TRUE16-NEXT: s_max_i32 s0, s0, s3 +; GFX1132DAGISEL-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-TRUE16-NEXT: ; %bb.2: +; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX1132DAGISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-TRUE16-LABEL: divergent_value_i16: +; GFX1132GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1132GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX1132GISEL-TRUE16-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-TRUE16-NEXT: s_brev_b32 s0, 1 +; GFX1132GISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-TRUE16-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-TRUE16-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-TRUE16-NEXT: s_bitset0_b32 s1, s2 +; GFX1132GISEL-TRUE16-NEXT: s_max_i32 s0, s0, s3 +; GFX1132GISEL-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132GISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-TRUE16-NEXT: ; %bb.2: +; GFX1132GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX1132GISEL-TRUE16-NEXT: 
global_store_b16 v[0:1], v2, off +; GFX1132GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] entry: %result = call i16 @llvm.amdgcn.wave.reduce.max.i16(i16 %in, i32 1) store i16 %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll index 4c9a44c2edf18..3db78875f4016 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll @@ -15,6 +15,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-TRUE16 %s + define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) { ; GFX8DAGISEL-LABEL: uniform_value_i16: ; GFX8DAGISEL: ; %bb.0: ; %entry @@ -332,81 +333,157 @@ define void @divergent_value_i16(ptr addrspace(1) %out, i16 %in) { ; GFX1032GISEL-NEXT: global_store_short v[0:1], v2, off ; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX1164DAGISEL-LABEL: divergent_value_i16: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164DAGISEL-NEXT: s_brev_b32 s2, -2 -; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX1164DAGISEL-NEXT: s_min_i32 s2, s2, s4 -; GFX1164DAGISEL-NEXT: 
s_cmp_lg_u64 s[0:1], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1164DAGISEL-NEXT: ; %bb.2: -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164DAGISEL-NEXT: global_store_b16 v[0:1], v2, off -; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164GISEL-LABEL: divergent_value_i16: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164GISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164GISEL-NEXT: s_brev_b32 s2, -2 -; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX1164GISEL-NEXT: s_min_i32 s2, s2, s4 -; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1164GISEL-NEXT: ; %bb.2: -; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164GISEL-NEXT: global_store_b16 v[0:1], v2, off -; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132DAGISEL-LABEL: divergent_value_i16: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX1132DAGISEL-NEXT: s_brev_b32 s0, -2 -; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 -; GFX1132DAGISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX1132DAGISEL-NEXT: s_min_i32 s0, s0, s3 -; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1132DAGISEL-NEXT: ; %bb.2: -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX1132DAGISEL-NEXT: global_store_b16 v[0:1], v2, off -; GFX1132DAGISEL-NEXT: s_setpc_b64 
s[30:31] -; -; GFX1132GISEL-LABEL: divergent_value_i16: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132GISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX1132GISEL-NEXT: s_brev_b32 s0, -2 -; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 -; GFX1132GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX1132GISEL-NEXT: s_min_i32 s0, s0, s3 -; GFX1132GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1132GISEL-NEXT: ; %bb.2: -; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX1132GISEL-NEXT: global_store_b16 v[0:1], v2, off -; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1164DAGISEL-FAKE16-LABEL: divergent_value_i16: +; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry +; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-FAKE16-NEXT: s_brev_b32 s2, -2 +; GFX1164DAGISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-FAKE16-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-FAKE16-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-FAKE16-NEXT: s_min_i32 s2, s2, s4 +; GFX1164DAGISEL-FAKE16-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.2: +; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164DAGISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-FAKE16-LABEL: divergent_value_i16: +; GFX1164GISEL-FAKE16: ; %bb.0: ; %entry +; 
GFX1164GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX1164GISEL-FAKE16-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-FAKE16-NEXT: s_brev_b32 s2, -2 +; GFX1164GISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-FAKE16-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-FAKE16-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-FAKE16-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-FAKE16-NEXT: s_min_i32 s2, s2, s4 +; GFX1164GISEL-FAKE16-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-FAKE16-NEXT: ; %bb.2: +; GFX1164GISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_i16: +; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry +; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s1, exec_lo +; GFX1132DAGISEL-FAKE16-NEXT: s_brev_b32 s0, -2 +; GFX1132DAGISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-FAKE16-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-FAKE16-NEXT: s_bitset0_b32 s1, s2 +; GFX1132DAGISEL-FAKE16-NEXT: s_min_i32 s0, s0, s3 +; GFX1132DAGISEL-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-FAKE16-NEXT: ; %bb.2: +; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132DAGISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-FAKE16-LABEL: divergent_value_i16: +; GFX1132GISEL-FAKE16: ; %bb.0: ; 
%entry +; GFX1132GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX1132GISEL-FAKE16-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-FAKE16-NEXT: s_brev_b32 s0, -2 +; GFX1132GISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-FAKE16-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-FAKE16-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-FAKE16-NEXT: s_bitset0_b32 s1, s2 +; GFX1132GISEL-FAKE16-NEXT: s_min_i32 s0, s0, s3 +; GFX1132GISEL-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132GISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-FAKE16-NEXT: ; %bb.2: +; GFX1132GISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164DAGISEL-TRUE16-LABEL: divergent_value_i16: +; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry +; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-TRUE16-NEXT: s_brev_b32 s2, -2 +; GFX1164DAGISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-TRUE16-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-TRUE16-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-TRUE16-NEXT: s_min_i32 s2, s2, s4 +; GFX1164DAGISEL-TRUE16-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.2: +; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX1164DAGISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-TRUE16-LABEL: divergent_value_i16: +; GFX1164GISEL-TRUE16: 
; %bb.0: ; %entry +; GFX1164GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX1164GISEL-TRUE16-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-TRUE16-NEXT: s_brev_b32 s2, -2 +; GFX1164GISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-TRUE16-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-TRUE16-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-TRUE16-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-TRUE16-NEXT: s_min_i32 s2, s2, s4 +; GFX1164GISEL-TRUE16-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-TRUE16-NEXT: ; %bb.2: +; GFX1164GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX1164GISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_i16: +; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry +; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s1, exec_lo +; GFX1132DAGISEL-TRUE16-NEXT: s_brev_b32 s0, -2 +; GFX1132DAGISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-TRUE16-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-TRUE16-NEXT: s_bitset0_b32 s1, s2 +; GFX1132DAGISEL-TRUE16-NEXT: s_min_i32 s0, s0, s3 +; GFX1132DAGISEL-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-TRUE16-NEXT: ; %bb.2: +; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX1132DAGISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-TRUE16-LABEL: divergent_value_i16: +; 
GFX1132GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1132GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX1132GISEL-TRUE16-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-TRUE16-NEXT: s_brev_b32 s0, -2 +; GFX1132GISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-TRUE16-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-TRUE16-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-TRUE16-NEXT: s_bitset0_b32 s1, s2 +; GFX1132GISEL-TRUE16-NEXT: s_min_i32 s0, s0, s3 +; GFX1132GISEL-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132GISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-TRUE16-NEXT: ; %bb.2: +; GFX1132GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX1132GISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] entry: %result = call i16 @llvm.amdgcn.wave.reduce.min.i16(i16 %in, i32 1) store i16 %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll index 5b49959869428..941ea62030cf9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll @@ -345,81 +345,157 @@ define void @divergent_value_i16(ptr addrspace(1) %out, i16 %in) { ; GFX1032GISEL-NEXT: global_store_short v[0:1], v2, off ; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX1164DAGISEL-LABEL: divergent_value_i16: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0 -; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; 
GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX1164DAGISEL-NEXT: s_max_u32 s2, s2, s4 -; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1164DAGISEL-NEXT: ; %bb.2: -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164DAGISEL-NEXT: global_store_b16 v[0:1], v2, off -; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164GISEL-LABEL: divergent_value_i16: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164GISEL-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164GISEL-NEXT: s_mov_b32 s2, 0 -; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX1164GISEL-NEXT: s_max_u32 s2, s2, s4 -; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1164GISEL-NEXT: ; %bb.2: -; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164GISEL-NEXT: global_store_b16 v[0:1], v2, off -; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132DAGISEL-LABEL: divergent_value_i16: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX1132DAGISEL-NEXT: s_mov_b32 s0, 0 -; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 -; GFX1132DAGISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX1132DAGISEL-NEXT: s_max_u32 s0, s0, s3 -; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 -; 
GFX1132DAGISEL-NEXT: ; %bb.2: -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX1132DAGISEL-NEXT: global_store_b16 v[0:1], v2, off -; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132GISEL-LABEL: divergent_value_i16: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132GISEL-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 -; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 -; GFX1132GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX1132GISEL-NEXT: s_max_u32 s0, s0, s3 -; GFX1132GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1132GISEL-NEXT: ; %bb.2: -; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX1132GISEL-NEXT: global_store_b16 v[0:1], v2, off -; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1164DAGISEL-FAKE16-LABEL: divergent_value_i16: +; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry +; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-FAKE16-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX1164DAGISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-FAKE16-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-FAKE16-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-FAKE16-NEXT: s_max_u32 s2, s2, s4 +; GFX1164DAGISEL-FAKE16-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.2: +; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164DAGISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, 
off +; GFX1164DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-FAKE16-LABEL: divergent_value_i16: +; GFX1164GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1164GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-FAKE16-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX1164GISEL-FAKE16-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX1164GISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-FAKE16-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-FAKE16-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-FAKE16-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-FAKE16-NEXT: s_max_u32 s2, s2, s4 +; GFX1164GISEL-FAKE16-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-FAKE16-NEXT: ; %bb.2: +; GFX1164GISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_i16: +; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry +; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-FAKE16-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s1, exec_lo +; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1132DAGISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-FAKE16-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-FAKE16-NEXT: s_bitset0_b32 s1, s2 +; GFX1132DAGISEL-FAKE16-NEXT: s_max_u32 s0, s0, s3 +; GFX1132DAGISEL-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-FAKE16-NEXT: ; %bb.2: +; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132DAGISEL-FAKE16-NEXT: global_store_b16 
v[0:1], v2, off +; GFX1132DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-FAKE16-LABEL: divergent_value_i16: +; GFX1132GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1132GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-FAKE16-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX1132GISEL-FAKE16-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1132GISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-FAKE16-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-FAKE16-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-FAKE16-NEXT: s_bitset0_b32 s1, s2 +; GFX1132GISEL-FAKE16-NEXT: s_max_u32 s0, s0, s3 +; GFX1132GISEL-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132GISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-FAKE16-NEXT: ; %bb.2: +; GFX1132GISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164DAGISEL-TRUE16-LABEL: divergent_value_i16: +; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry +; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-TRUE16-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX1164DAGISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-TRUE16-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-TRUE16-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-TRUE16-NEXT: s_max_u32 s2, s2, s4 +; GFX1164DAGISEL-TRUE16-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.2: +; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX1164DAGISEL-TRUE16-NEXT: 
global_store_b16 v[0:1], v2, off +; GFX1164DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-TRUE16-LABEL: divergent_value_i16: +; GFX1164GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1164GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-TRUE16-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX1164GISEL-TRUE16-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX1164GISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-TRUE16-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-TRUE16-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-TRUE16-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-TRUE16-NEXT: s_max_u32 s2, s2, s4 +; GFX1164GISEL-TRUE16-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-TRUE16-NEXT: ; %bb.2: +; GFX1164GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX1164GISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_i16: +; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry +; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-TRUE16-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s1, exec_lo +; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1132DAGISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-TRUE16-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-TRUE16-NEXT: s_bitset0_b32 s1, s2 +; GFX1132DAGISEL-TRUE16-NEXT: s_max_u32 s0, s0, s3 +; GFX1132DAGISEL-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-TRUE16-NEXT: ; %bb.2: +; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 +; 
GFX1132DAGISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-TRUE16-LABEL: divergent_value_i16: +; GFX1132GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1132GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-TRUE16-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX1132GISEL-TRUE16-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1132GISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-TRUE16-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-TRUE16-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-TRUE16-NEXT: s_bitset0_b32 s1, s2 +; GFX1132GISEL-TRUE16-NEXT: s_max_u32 s0, s0, s3 +; GFX1132GISEL-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132GISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-TRUE16-NEXT: ; %bb.2: +; GFX1132GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX1132GISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] entry: %result = call i16 @llvm.amdgcn.wave.reduce.umax.i16(i16 %in, i32 1) store i16 %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll index 4a9fde954573c..aec64a15b2782 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll @@ -333,81 +333,157 @@ define void @divergent_value_i16(ptr addrspace(1) %out, i16 %in) { ; GFX1032GISEL-NEXT: global_store_short v[0:1], v2, off ; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX1164DAGISEL-LABEL: divergent_value_i16: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164DAGISEL-NEXT: s_mov_b32 s2, -1 -; GFX1164DAGISEL-NEXT: .LBB1_1: ; 
=>This Inner Loop Header: Depth=1 -; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX1164DAGISEL-NEXT: s_min_u32 s2, s2, s4 -; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1164DAGISEL-NEXT: ; %bb.2: -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164DAGISEL-NEXT: global_store_b16 v[0:1], v2, off -; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164GISEL-LABEL: divergent_value_i16: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164GISEL-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164GISEL-NEXT: s_mov_b32 s2, -1 -; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX1164GISEL-NEXT: s_min_u32 s2, s2, s4 -; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1164GISEL-NEXT: ; %bb.2: -; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164GISEL-NEXT: global_store_b16 v[0:1], v2, off -; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132DAGISEL-LABEL: divergent_value_i16: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX1132DAGISEL-NEXT: s_mov_b32 s0, -1 -; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 -; GFX1132DAGISEL-NEXT: 
s_bitset0_b32 s1, s2 -; GFX1132DAGISEL-NEXT: s_min_u32 s0, s0, s3 -; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1132DAGISEL-NEXT: ; %bb.2: -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX1132DAGISEL-NEXT: global_store_b16 v[0:1], v2, off -; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132GISEL-LABEL: divergent_value_i16: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132GISEL-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX1132GISEL-NEXT: s_mov_b32 s0, -1 -; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 -; GFX1132GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX1132GISEL-NEXT: s_min_u32 s0, s0, s3 -; GFX1132GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1132GISEL-NEXT: ; %bb.2: -; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX1132GISEL-NEXT: global_store_b16 v[0:1], v2, off -; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1164DAGISEL-FAKE16-LABEL: divergent_value_i16: +; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry +; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-FAKE16-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX1164DAGISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-FAKE16-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-FAKE16-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-FAKE16-NEXT: s_min_u32 s2, s2, s4 +; GFX1164DAGISEL-FAKE16-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_scc1 
.LBB1_1 +; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.2: +; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164DAGISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-FAKE16-LABEL: divergent_value_i16: +; GFX1164GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1164GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-FAKE16-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX1164GISEL-FAKE16-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX1164GISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-FAKE16-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-FAKE16-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-FAKE16-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-FAKE16-NEXT: s_min_u32 s2, s2, s4 +; GFX1164GISEL-FAKE16-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-FAKE16-NEXT: ; %bb.2: +; GFX1164GISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_i16: +; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry +; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-FAKE16-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s1, exec_lo +; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s0, -1 +; GFX1132DAGISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-FAKE16-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-FAKE16-NEXT: s_bitset0_b32 s1, s2 +; GFX1132DAGISEL-FAKE16-NEXT: s_min_u32 s0, s0, s3 +; GFX1132DAGISEL-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132DAGISEL-FAKE16-NEXT: 
s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-FAKE16-NEXT: ; %bb.2: +; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132DAGISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-FAKE16-LABEL: divergent_value_i16: +; GFX1132GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1132GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-FAKE16-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX1132GISEL-FAKE16-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-FAKE16-NEXT: s_mov_b32 s0, -1 +; GFX1132GISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-FAKE16-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-FAKE16-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-FAKE16-NEXT: s_bitset0_b32 s1, s2 +; GFX1132GISEL-FAKE16-NEXT: s_min_u32 s0, s0, s3 +; GFX1132GISEL-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132GISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-FAKE16-NEXT: ; %bb.2: +; GFX1132GISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164DAGISEL-TRUE16-LABEL: divergent_value_i16: +; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry +; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-TRUE16-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX1164DAGISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-TRUE16-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-TRUE16-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-TRUE16-NEXT: s_min_u32 s2, s2, s4 +; GFX1164DAGISEL-TRUE16-NEXT: s_cmp_lg_u64 s[0:1], 0 +; 
GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.2: +; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX1164DAGISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-TRUE16-LABEL: divergent_value_i16: +; GFX1164GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1164GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-TRUE16-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX1164GISEL-TRUE16-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX1164GISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-TRUE16-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-TRUE16-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-TRUE16-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-TRUE16-NEXT: s_min_u32 s2, s2, s4 +; GFX1164GISEL-TRUE16-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-TRUE16-NEXT: ; %bb.2: +; GFX1164GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX1164GISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_i16: +; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry +; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-TRUE16-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s1, exec_lo +; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s0, -1 +; GFX1132DAGISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-TRUE16-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-TRUE16-NEXT: s_bitset0_b32 s1, s2 +; GFX1132DAGISEL-TRUE16-NEXT: s_min_u32 s0, s0, s3 +; GFX1132DAGISEL-TRUE16-NEXT: 
s_cmp_lg_u32 s1, 0 +; GFX1132DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-TRUE16-NEXT: ; %bb.2: +; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX1132DAGISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-TRUE16-LABEL: divergent_value_i16: +; GFX1132GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1132GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-TRUE16-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX1132GISEL-TRUE16-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-TRUE16-NEXT: s_mov_b32 s0, -1 +; GFX1132GISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-TRUE16-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-TRUE16-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-TRUE16-NEXT: s_bitset0_b32 s1, s2 +; GFX1132GISEL-TRUE16-NEXT: s_min_u32 s0, s0, s3 +; GFX1132GISEL-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132GISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-TRUE16-NEXT: ; %bb.2: +; GFX1132GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX1132GISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] entry: %result = call i16 @llvm.amdgcn.wave.reduce.umin.i16(i16 %in, i32 1) store i16 %result, ptr addrspace(1) %out _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
