================
@@ -892,6 +892,79 @@ bool MFMASmallGemmOpt::applyIGLPStrategy(
return true;
}
+/// Returns true when \p MI should be classified as \c SchedGroupMask::VALU
+/// (e.g. barrier mask \c 0x2).
+///
+/// Meta instructions never qualify. Otherwise \p MI must be reported as VALU
+/// by \p TII and must be none of: MFMA/WMMA, TRANS, LDS-DMA, or an instruction
+/// that may load or store.
+static bool matchesSchedGroupValu(const MachineInstr &MI,
+                                  const SIInstrInfo *TII) {
+  if (MI.isMetaInstruction() || !TII->isVALU(MI))
+    return false;
+  if (TII->isMFMAorWMMA(MI) || TII->isTRANS(MI))
+    return false;
+  // Some memory instructions may be marked as VALU (e.g. BUFFER_LOAD_*_LDS).
+  // For our purposes, these shall not be classified as VALU as this results
+  // in unexpected behavior.
+  return !(TII->isLDSDMA(MI) || MI.mayLoadOrStore());
+}
+
+/// IGLP strategy that spaces MFMA/WMMA instructions apart with VALU slots.
+///
+/// Each repeating stage is one MFMA (or WMMA) followed by up to N VALU ops in
+/// the gap, where N = floor(#VALU / #MFMA) counted over this schedule region
+/// with the same predicate as \c matchesSchedGroupValu, and N is at least 1.
+/// The template length uses MFMACount * 3 for slack, like MFMASmallGemmOpt.
+/// \c IsBottomUp is set to false so that the SchedGroup pipeline order matches
+/// forward program order (an MFMA comes before its VALU gap).
+class MFMAValuSpacingOpt final : public IGLPStrategy {
+public:
+  MFMAValuSpacingOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+      : IGLPStrategy(DAG, TII) {
+    // Build the pipeline in forward program order.
+    IsBottomUp = false;
+  }
+
+  /// The strategy applies only when \p DAG contains at least one MFMA or WMMA
+  /// instruction. \p Phase is not consulted.
+  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
+                           AMDGPU::SchedulingPhase Phase) override {
+    for (const MachineInstr &MI : *DAG) {
+      if (!TII->isMFMAorWMMA(MI))
+        continue;
+      return true;
+    }
+    return false;
+  }
+
+  bool applyIGLPStrategy(
+      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+      AMDGPU::SchedulingPhase Phase) override;
+};
+
+bool MFMAValuSpacingOpt::applyIGLPStrategy(
+ DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+ DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+ AMDGPU::SchedulingPhase Phase) {
+ unsigned MFMACount = 0;
+ unsigned ValuCount = 0;
+ for (const MachineInstr &I : *DAG) {
+ if (TII->isMFMAorWMMA(I))
+ ++MFMACount;
+ else if (matchesSchedGroupValu(I, TII))
+ ++ValuCount;
+ }
+
+ unsigned ValuGap = 1;
+ if (MFMACount > 0 && ValuCount > MFMACount) {
+ ValuGap = ValuCount / MFMACount;
+ }
+
+ const unsigned PipelineSyncID = 0;
+ SchedGroup *SG = nullptr;
+ for (unsigned I = 0; I < MFMACount * 3; ++I) {
----------------
hidekisaito wrote:
I left it as is. What's a good amount of slack? Is "+ 2" better than "* 3"? I
just do not want to be too strict, in case something isn't quite right but is
close enough.
https://github.com/llvm/llvm-project/pull/190916
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits