================
@@ -2904,6 +2905,68 @@ bool SIInsertWaitcnts::applyDSLoopWaitOpt(MachineInstr 
&MI,
   return true;
 }
 
+// Insert DS_CNT flush in preheaders of loops where DS wait relaxation was
+// applied. This is necessary because the relaxed wait counts inside the loop
+// are computed based on the DS loads issued at the end of the previous
+// iteration (via backedge), but the first iteration enters via the preheader.
+// We must ensure all DS loads from the preheader are complete before entering
+// the loop.
+bool SIInsertWaitcnts::insertDSPreheaderFlushes(MachineFunction &MF) {
+  bool Modified = false;
+
+  for (auto &[LoopHeader, Info] : LoopDSWaitOptCache) {
+    if (!Info.Valid || !Info.RelaxationApplied)
+      continue;
+
+    MachineLoop *ML = MLI->getLoopFor(LoopHeader);
+    if (!ML)
+      continue;
+
+    MachineBasicBlock *Preheader = ML->getLoopPreheader();
+    if (!Preheader)
+      continue;
+
+    // Insert s_wait_dscnt 0 at the end of the preheader (before the terminator)
+    MachineBasicBlock::iterator InsertPos = Preheader->getFirstTerminator();
+    if (InsertPos == Preheader->end() && !Preheader->empty())
+      InsertPos = std::prev(Preheader->end());
+
+    // Check if there's already a DS wait at this position
+    bool NeedInsert = true;
+    if (InsertPos != Preheader->end() && InsertPos != Preheader->begin()) {
+      auto CheckPos = std::prev(InsertPos);
+      if (CheckPos->getOpcode() == AMDGPU::S_WAIT_DSCNT_soft ||
+          CheckPos->getOpcode() == AMDGPU::S_WAIT_DSCNT) {
+        if (CheckPos->getOperand(0).getImm() == 0)
+          NeedInsert = false;
+        else {
+          // Change existing wait to 0
+          CheckPos->getOperand(0).setImm(0);
+          NeedInsert = false;
+          Modified = true;
+          LLVM_DEBUG(dbgs() << "DS Loop Opt: Changed existing DS_CNT wait to 0"
+                            << " in preheader ";
+                     Preheader->printName(dbgs()); dbgs() << "\n");
+        }
+      }
+    }
+
+    if (NeedInsert) {
+      DebugLoc DL;
+      if (InsertPos != Preheader->end())
+        DL = InsertPos->getDebugLoc();
+      BuildMI(*Preheader, InsertPos, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft))
----------------
Pierre-vh wrote:

Don't add `_soft` waitcnts after this pass is done; this pass is supposed to
eliminate them. Add normal (non-soft) waits instead.

https://github.com/llvm/llvm-project/pull/171948
_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to