llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Lucas Ramirez (lucas-rami) <details> <summary>Changes</summary> When scheduling must be reverted for a region, the current implementation re-orders non-debug instructions and debug instructions separately; the former in a first pass and the latter in a second pass handled by a generic machine scheduler helper whose state is tied to the current region being scheduled, in turn limiting the revert logic to only work on the active scheduling region. This makes the revert logic work in a single pass for all MIs, and removes the restriction that it works exclusively on the active scheduling region. The latter enables future use cases such as reverting scheduling of multiple regions at once. While the instruction order produced should be identical to what it was before, small changes in slot indices of re-scheduled MIs yield different RA decisions and significant test churn. --- Patch is 16.02 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/177203.diff 38 Files Affected: - (modified) llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp (+42-50) - (modified) llvm/lib/Target/AMDGPU/GCNSchedStrategy.h (+6-2) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+138-138) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+88-88) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll (+66-66) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll (+69-69) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+67943-68201) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+78-79) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+10830-10928) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll (+156-144) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll (+641-609) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll (+1356-1313) - (modified) 
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll (+2096-2070) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll (+3477-3552) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+5673-5697) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+4456-4503) - (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+1122-1122) - (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll (+35-35) - (modified) llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir (+2-2) - (modified) llvm/test/CodeGen/AMDGPU/div_v2i128.ll (+141-141) - (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (+95-94) - (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll (+43-36) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir (+689-694) - (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll (+18-18) - (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+443-476) - (modified) llvm/test/CodeGen/AMDGPU/load-global-i32.ll (+236-237) - (modified) llvm/test/CodeGen/AMDGPU/load-global-i8.ll (+379-378) - (modified) llvm/test/CodeGen/AMDGPU/load-local-i16.ll (+961-942) - (modified) llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir (+4-4) - (modified) llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll (+192-192) - (modified) llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll (+2374-2391) - (modified) llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll (+192-192) - (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+42-42) - (modified) llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll (+291-269) - (modified) llvm/test/CodeGen/AMDGPU/select.f16.ll (+124-122) - (modified) llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle.ll (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/spill-vgpr.ll (+7-7) - (modified) llvm/test/CodeGen/AMDGPU/srem.ll (+101-101) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 
876adddcfbbaa..f656fb56e9e74 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1542,10 +1542,12 @@ void GCNSchedStage::checkScheduling() { // Revert if this region's schedule would cause a drop in occupancy or // spilling. - if (shouldRevertScheduling(WavesAfter)) - revertScheduling(); - else + if (shouldRevertScheduling(WavesAfter)) { + modifyRegionSchedule(RegionIdx, DAG.BB, Unsched); + std::tie(DAG.RegionBegin, DAG.RegionEnd) = DAG.Regions[RegionIdx]; + } else { DAG.Pressure[RegionIdx] = PressureAfter; + } } unsigned @@ -1773,66 +1775,56 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { return false; } -void GCNSchedStage::revertScheduling() { - LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); - DAG.RegionEnd = DAG.RegionBegin; - int SkippedDebugInstr = 0; - for (MachineInstr *MI : Unsched) { +void GCNSchedStage::modifyRegionSchedule(unsigned RegionIdx, + MachineBasicBlock *MBB, + ArrayRef<MachineInstr *> MIOrder) { + assert(std::distance(DAG.Regions[RegionIdx].first, + DAG.Regions[RegionIdx].second) == + static_cast<long>(MIOrder.size()) && + "instruction number mismatch"); + if (MIOrder.empty()) + return; + + LLVM_DEBUG(dbgs() << "Reverting scheduling for region " << RegionIdx << '\n'); + + // Reconstruct MI sequence by moving instructions in desired order before + // the current region's start. + MachineBasicBlock::iterator RegionEnd = DAG.Regions[RegionIdx].first; + for (MachineInstr *MI : MIOrder) { + // Either move the next MI in order before the end of the region or move the + // region end past the MI if it is at the correct position. 
+ if (MI->getIterator() != RegionEnd) + MBB->splice(RegionEnd, MBB, MI); + else + ++RegionEnd; + if (MI->isDebugInstr()) { - ++SkippedDebugInstr; + LLVM_DEBUG(dbgs() << "Scheduling " << *MI); continue; } - if (MI->getIterator() != DAG.RegionEnd) { - DAG.BB->splice(DAG.RegionEnd, DAG.BB, MI); - if (!MI->isDebugInstr()) - DAG.LIS->handleMove(*MI, true); - } + DAG.LIS->handleMove(*MI, true); // Reset read-undef flags and update them later. - for (auto &Op : MI->all_defs()) + for (MachineOperand &Op : MI->all_defs()) Op.setIsUndef(false); RegisterOperands RegOpers; RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false); - if (!MI->isDebugInstr()) { - if (DAG.ShouldTrackLaneMasks) { - // Adjust liveness and add missing dead+read-undef flags. - SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot(); - RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI); - } else { - // Adjust for missing dead-def flags. - RegOpers.detectDeadDefs(*MI, *DAG.LIS); - } + if (DAG.ShouldTrackLaneMasks) { + // Adjust liveness and add missing dead+read-undef flags. + SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot(); + RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI); + } else { + // Adjust for missing dead-def flags. + RegOpers.detectDeadDefs(*MI, *DAG.LIS); } - DAG.RegionEnd = MI->getIterator(); - ++DAG.RegionEnd; LLVM_DEBUG(dbgs() << "Scheduling " << *MI); } - // After reverting schedule, debug instrs will now be at the end of the block - // and RegionEnd will point to the first debug instr. Increment RegionEnd - // pass debug instrs to the actual end of the scheduling region. - while (SkippedDebugInstr-- > 0) - ++DAG.RegionEnd; - - // If Unsched.front() instruction is a debug instruction, this will actually - // shrink the region since we moved all debug instructions to the end of the - // block. Find the first instruction that is not a debug instruction. 
- DAG.RegionBegin = Unsched.front()->getIterator(); - if (DAG.RegionBegin->isDebugInstr()) { - for (MachineInstr *MI : Unsched) { - if (MI->isDebugInstr()) - continue; - DAG.RegionBegin = MI->getIterator(); - break; - } - } - - // Then move the debug instructions back into their correct place and set - // RegionBegin and RegionEnd if needed. - DAG.placeDebugValues(); - - DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); + // The region end doesn't change throughout scheduling since it itself is + // outside the region (whether that is a MBB end or a terminator MI). + assert(RegionEnd == DAG.Regions[RegionIdx].second && "region end mismatch"); + DAG.Regions[RegionIdx].first = MIOrder.front(); } bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 6563c8d050ff9..a4485621f48bf 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -397,8 +397,12 @@ class GCNSchedStage { // Returns true if the new schedule may result in more spilling. bool mayCauseSpilling(unsigned WavesAfter); - // Attempt to revert scheduling for this region. - void revertScheduling(); + /// Sets the schedule of region \p RegionIdx in block \p MBB to \p MIOrder. + /// The MIs in \p MIOrder must be exactly the same as the ones currently + /// existing inside the region, only in a different order that honors def-use + /// chains. 
+ void modifyRegionSchedule(unsigned RegionIdx, MachineBasicBlock *MBB, + ArrayRef<MachineInstr *> MIOrder); void advanceRegion() { RegionIdx++; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index 54efb26ae1e01..541e0bbf6aa8f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -7376,183 +7376,183 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-LABEL: v_fshl_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v16 -; GFX6-NEXT: v_not_b32_e32 v18, 63 -; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 64, v19 -; GFX6-NEXT: v_add_i32_e32 v27, vcc, v19, v18 -; GFX6-NEXT: v_lshr_b64 v[23:24], v[0:1], v23 -; GFX6-NEXT: v_lshl_b64 v[25:26], v[2:3], v19 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[0:1], v19 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v27 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 -; GFX6-NEXT: v_or_b32_e32 v19, v23, v25 -; GFX6-NEXT: v_or_b32_e32 v23, v24, v26 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v19, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v23, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 +; GFX6-NEXT: v_not_b32_e32 v25, 63 +; GFX6-NEXT: v_add_i32_e32 v26, vcc, v23, v25 +; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 64, v23 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX6-NEXT: v_lshl_b64 v[17:18], v[0:1], v23 +; GFX6-NEXT: v_lshr_b64 v[21:22], v[0:1], v21 +; GFX6-NEXT: v_lshl_b64 v[23:24], v[2:3], v23 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v26 +; GFX6-NEXT: v_or_b32_e32 v21, v21, v23 +; GFX6-NEXT: v_or_b32_e32 v22, v22, v24 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v21, v0, 
v2, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v22, v1, v3, s[4:5] ; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], 1 -; GFX6-NEXT: v_mov_b32_e32 v17, 0x7f +; GFX6-NEXT: v_mov_b32_e32 v19, 0x7f ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v10 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], 1 -; GFX6-NEXT: v_bfi_b32 v10, v16, 0, v17 -; GFX6-NEXT: v_cndmask_b32_e32 v24, 0, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v25, 0, v22, vcc -; GFX6-NEXT: v_add_i32_e32 v16, vcc, v10, v18 -; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 64, v10 +; GFX6-NEXT: v_bfi_b32 v10, v16, 0, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v23, 0, v17, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v18, vcc +; GFX6-NEXT: v_add_i32_e32 v24, vcc, v10, v25 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v10 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v10 ; GFX6-NEXT: v_lshr_b64 v[10:11], v[0:1], v10 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v21 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v16 -; GFX6-NEXT: v_or_b32_e32 v10, v10, v21 -; GFX6-NEXT: v_or_b32_e32 v11, v11, v22 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX6-NEXT: v_lshl_b64 v[16:17], v[2:3], v16 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v24 +; GFX6-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX6-NEXT: v_or_b32_e32 v10, v10, v16 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v20 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX6-NEXT: v_or_b32_e32 v1, v18, v1 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v16, v25 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v16 -; GFX6-NEXT: v_lshr_b64 v[10:11], v[4:5], v10 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[6:7], v16 -; GFX6-NEXT: v_or_b32_e32 v2, v19, v2 -; 
GFX6-NEXT: v_add_i32_e32 v19, vcc, v16, v18 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v16 -; GFX6-NEXT: v_or_b32_e32 v16, v10, v21 -; GFX6-NEXT: v_or_b32_e32 v21, v11, v22 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[4:5], v19 -; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v8, v10, v16, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, v11, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5] -; GFX6-NEXT: v_lshr_b64 v[8:9], v[12:13], 1 -; GFX6-NEXT: v_lshlrev_b32_e32 v10, 31, v14 -; GFX6-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX6-NEXT: v_lshr_b64 v[10:11], v[14:15], 1 -; GFX6-NEXT: v_bfi_b32 v14, v20, 0, v17 -; GFX6-NEXT: v_add_i32_e32 v18, vcc, v14, v18 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v14 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 -; GFX6-NEXT: v_lshr_b64 v[12:13], v[10:11], v14 -; GFX6-NEXT: v_lshr_b64 v[14:15], v[8:9], v14 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[10:11], v16 -; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], v18 -; GFX6-NEXT: v_or_b32_e32 v14, v14, v16 -; GFX6-NEXT: v_or_b32_e32 v15, v15, v17 -; GFX6-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v24, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v25, v1 -; GFX6-NEXT: v_or_b32_e32 v3, v23, v3 -; GFX6-NEXT: v_or_b32_e32 v4, v4, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v5, v9 -; GFX6-NEXT: v_or_b32_e32 v6, v6, v10 -; GFX6-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[4:5], v10 +; GFX6-NEXT: v_lshl_b64 v[16:17], v[6:7], v16 +; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v18 +; 
GFX6-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX6-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX6-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v18, v4, v6, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v21, v5, v7, s[4:5] +; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], 1 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 31, v14 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1 +; GFX6-NEXT: v_bfi_b32 v10, v20, 0, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v9, vcc +; GFX6-NEXT: v_add_i32_e32 v14, vcc, v10, v25 +; GFX6-NEXT: v_sub_i32_e32 v12, vcc, 64, v10 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], v10 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[4:5], v10 +; GFX6-NEXT: v_lshl_b64 v[12:13], v[6:7], v12 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v14 +; GFX6-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX6-NEXT: v_or_b32_e32 v11, v11, v13 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc +; GFX6-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX6-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v16, v4 +; GFX6-NEXT: v_or_b32_e32 v5, v17, v5 +; GFX6-NEXT: v_or_b32_e32 v6, v18, v6 +; GFX6-NEXT: v_or_b32_e32 v7, v21, v7 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v16 -; GFX8-NEXT: v_not_b32_e32 v18, 63 -; GFX8-NEXT: v_sub_u32_e32 v23, vcc, 64, v19 -; GFX8-NEXT: v_add_u32_e32 v27, vcc, v19, v18 -; GFX8-NEXT: v_lshrrev_b64 v[23:24], v23, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 
v[25:26], v19, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v27, v[0:1] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 -; GFX8-NEXT: v_or_b32_e32 v19, v23, v25 -; GFX8-NEXT: v_or_b32_e32 v23, v24, v26 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v19, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v23, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5] +; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16 +; GFX8-NEXT: v_not_b32_e32 v25, 63 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v23, v25 +; GFX8-NEXT: v_sub_u32_e32 v21, vcc, 64, v23 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX8-NEXT: v_lshlrev_b64 v[17:18], v23, v[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[21:22], v21, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[23:24], v23, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v26, v[0:1] +; GFX8-NEXT: v_or_b32_e32 v21, v21, v23 +; GFX8-NEXT: v_or_b32_e32 v22, v22, v24 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v21, v0, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v22, v1, v3, s[4:5] ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v17, 0x7f +; GFX8-NEXT: v_mov_b32_e32 v19, 0x7f ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v10 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] -; GFX8-NEXT: v_bfi_b32 v10, v16, 0, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v24, 0, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v25, 0, v22, vcc -; GFX8-NEXT: v_add_u32_e32 v16, vcc, v10, v18 -; GFX8-NEXT: v_sub_u32_e32 v21, vcc, 64, v10 +; GFX8-NEXT: v_bfi_b32 v10, v16, 0, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v23, 0, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v24, vcc, v10, v25 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v10 ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; 
GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v10, v[2:3] ; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v21, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v16, v[2:3] -; GFX8-NEXT: v_or_b32_e32 v10, v10, v21 -; GFX8-NEXT: v_or_b32_e32 v11, v11, v22 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v24, v[2:3] +; GFX8-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX8-NEXT: v_or_b32_e32 v10, v10, v16 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v20 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX8-NEXT: v_or_b32_e32 v1, v18, v1 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v16, v25 ; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v16 -; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v16, v[6:7] -; GFX8-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v16, v18 ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5] -; GFX8-NEXT: v_or_b32_e32 v16, v10, v21 -; GFX8-NEXT: v_or_b32_e32 v21, v11, v22 -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v19, v[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 31, v14 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[14:15] -; 
GFX8-NEXT: v_bfi_b32 v14, v20, 0, v17 -; GFX8-NEXT: v_add_u32_e32 v18, vcc, v14, v18 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v14 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 -; GFX8-NEXT: v_lshrrev_b64 v[12:13], v14, v[10:11] -; GFX8-NEXT: v_lshrrev_b64 v[14:15], v14, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] -; GFX8-NEXT: v_lshrrev_b64 v[10:11], v18, v[10:11] -; GFX8-NEXT: v_or_b32_e32 v14, v14, v16 -; GFX8-NEXT: v_or_b32_e32 v15, v15, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v24, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v25, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v23, v3 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v8 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v9 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v10 -; GFX8-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v18, v[4:5] +; GFX8-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX8-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX8-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v18, v4, v6, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v21, v5, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 31, v14 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] +; GFX8-NEXT: v_bfi_b32 v10, v20, 0, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v9, vcc +; GFX8-NEXT: v_add_u32_e32 v14, vc... 
[truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/177203 _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
