[AMDGPU] New AMDGPUInsertDelayAlu pass

Differential Revision: https://reviews.llvm.org/D128270
2022-06-21 11:46:28 +01:00 · 2022-06-21 11:46:28 +01:00 · cfb7ffdec0
parent 8d29f0fdb9
commit cfb7ffdec0
39 changed files with 1442 additions and 15 deletions
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@ -299,6 +299,9 @@ extern char &SIMemoryLegalizerID;
 void initializeSIModeRegisterPass(PassRegistry&);
 extern char &SIModeRegisterID;

+void initializeAMDGPUInsertDelayAluPass(PassRegistry &);
+extern char &AMDGPUInsertDelayAluID;
+
 void initializeSIInsertHardClausesPass(PassRegistry &);
 extern char &SIInsertHardClausesID;

--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@ -0,0 +1,457 @@
+//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/SetVector.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-insert-delay-alu"
+
+namespace {
+
+class AMDGPUInsertDelayAlu : public MachineFunctionPass {
+public:
+  static char ID;
+
+  const SIInstrInfo *SII;
+  const TargetRegisterInfo *TRI;
+
+  TargetSchedModel SchedModel;
+
+  AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  // Return true if MI waits for all outstanding VALU instructions to complete.
+  static bool instructionWaitsForVALU(const MachineInstr &MI) {
+    // These instruction types wait for VA_VDST==0 before issuing.
+    const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
+                               SIInstrFlags::FLAT | SIInstrFlags::MIMG |
+                               SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
+    if (MI.getDesc().TSFlags & VA_VDST_0)
+      return true;
+    if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
+        MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
+      return true;
+    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+        (MI.getOperand(0).getImm() & 0xf000) == 0)
+      return true;
+    return false;
+  }
+
+  // Types of delay that can be encoded in an s_delay_alu instruction.
+  enum DelayType { VALU, TRANS, SALU, OTHER };
+
+  // Get the delay type for an instruction with the specified TSFlags.
+  static DelayType getDelayType(uint64_t TSFlags) {
+    if (TSFlags & SIInstrFlags::TRANS)
+      return TRANS;
+    if (TSFlags & SIInstrFlags::VALU)
+      return VALU;
+    if (TSFlags & SIInstrFlags::SALU)
+      return SALU;
+    return OTHER;
+  }
+
+  // Information about the last instruction(s) that wrote to a particular
+  // regunit. In straight-line code there will only be one such instruction, but
+  // when control flow converges we merge the delay information from each path
+  // to represent the union of the worst-case delays of each type.
+  struct DelayInfo {
+    // One larger than the maximum number of (non-TRANS) VALU instructions we
+    // can encode in an s_delay_alu instruction.
+    static const unsigned VALU_MAX = 5;
+
+    // One larger than the maximum number of TRANS instructions we can encode in
+    // an s_delay_alu instruction.
+    static const unsigned TRANS_MAX = 4;
+
+    // If it was written by a (non-TRANS) VALU, remember how many clock cycles
+    // are left until it completes, and how many other (non-TRANS) VALU we have
+    // seen since it was issued.
+    uint8_t VALUCycles = 0;
+    uint8_t VALUNum = VALU_MAX;
+
+    // If it was written by a TRANS, remember how many clock cycles are left
+    // until it completes, and how many other TRANS we have seen since it was
+    // issued.
+    uint8_t TRANSCycles = 0;
+    uint8_t TRANSNum = TRANS_MAX;
+    // Also remember how many other (non-TRANS) VALU we have seen since it was
+    // issued. When an instruction depends on both a prior TRANS and a prior
+    // non-TRANS VALU, this is used to decide whether to encode a wait for just
+    // one or both of them.
+    uint8_t TRANSNumVALU = VALU_MAX;
+
+    // If it was written by an SALU, remember how many clock cycles are left
+    // until it completes.
+    uint8_t SALUCycles = 0;
+
+    DelayInfo() = default;
+
+    DelayInfo(DelayType Type, unsigned Cycles) {
+      switch (Type) {
+      default:
+        llvm_unreachable("unexpected type");
+      case VALU:
+        VALUCycles = Cycles;
+        VALUNum = 0;
+        break;
+      case TRANS:
+        TRANSCycles = Cycles;
+        TRANSNum = 0;
+        TRANSNumVALU = 0;
+        break;
+      case SALU:
+        SALUCycles = Cycles;
+        break;
+      }
+    }
+
+    bool operator==(const DelayInfo &RHS) const {
+      return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
+             TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
+             TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
+    }
+
+    bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
+
+    // Merge another DelayInfo into this one, to represent the union of the
+    // worst-case delays of each type.
+    void merge(const DelayInfo &RHS) {
+      VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
+      VALUNum = std::min(VALUNum, RHS.VALUNum);
+      TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
+      TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
+      TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
+      SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
+    }
+
+    // Update this DelayInfo after issuing an instruction. IsVALU should be 1
+    // when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing
+    // a TRANS, else 0. Cycles is the number of cycles it takes to issue the
+    // instruction.  Return true if there is no longer any useful delay info.
+    bool advance(DelayType Type, unsigned Cycles) {
+      bool Erase = true;
+
+      VALUNum += (Type == VALU);
+      if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
+        // Forget about the VALU instruction. It was too far back or has
+        // definitely completed by now.
+        VALUNum = VALU_MAX;
+        VALUCycles = 0;
+      } else {
+        VALUCycles -= Cycles;
+        Erase = false;
+      }
+
+      TRANSNum += (Type == TRANS);
+      TRANSNumVALU += (Type == VALU);
+      if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
+        // Forget about any TRANS instruction. It was too far back or has
+        // definitely completed by now.
+        TRANSNum = TRANS_MAX;
+        TRANSNumVALU = VALU_MAX;
+        TRANSCycles = 0;
+      } else {
+        TRANSCycles -= Cycles;
+        Erase = false;
+      }
+
+      if (SALUCycles <= Cycles) {
+        // Forget about any SALU instruction. It has definitely completed by
+        // now.
+        SALUCycles = 0;
+      } else {
+        SALUCycles -= Cycles;
+        Erase = false;
+      }
+
+      return Erase;
+    }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    void dump() const {
+      if (VALUCycles)
+        dbgs() << " VALUCycles=" << (int)VALUCycles;
+      if (VALUNum < VALU_MAX)
+        dbgs() << " VALUNum=" << (int)VALUNum;
+      if (TRANSCycles)
+        dbgs() << " TRANSCycles=" << (int)TRANSCycles;
+      if (TRANSNum < TRANS_MAX)
+        dbgs() << " TRANSNum=" << (int)TRANSNum;
+      if (TRANSNumVALU < VALU_MAX)
+        dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
+      if (SALUCycles)
+        dbgs() << " SALUCycles=" << (int)SALUCycles;
+    }
+#endif
+  };
+
+  // A map from regunits to the delay info for that regunit.
+  struct DelayState : DenseMap<unsigned, DelayInfo> {
+    // Merge another DelayState into this one by merging the delay info for each
+    // regunit.
+    void merge(const DelayState &RHS) {
+      for (const auto &KV : RHS) {
+        iterator It;
+        bool Inserted;
+        std::tie(It, Inserted) = insert(KV);
+        if (!Inserted)
+          It->second.merge(KV.second);
+      }
+    }
+
+    // Advance the delay info for each regunit, erasing any that are no longer
+    // useful.
+    void advance(DelayType Type, unsigned Cycles) {
+      iterator Next;
+      for (auto I = begin(), E = end(); I != E; I = Next) {
+        Next = std::next(I);
+        if (I->second.advance(Type, Cycles))
+          erase(I);
+      }
+    }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    void dump(const TargetRegisterInfo *TRI) const {
+      if (empty()) {
+        dbgs() << "    empty\n";
+        return;
+      }
+
+      // Dump DelayInfo for each RegUnit in numerical order.
+      SmallVector<const_iterator, 8> Order;
+      Order.reserve(size());
+      for (const_iterator I = begin(), E = end(); I != E; ++I)
+        Order.push_back(I);
+      llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
+        return A->first < B->first;
+      });
+      for (const_iterator I : Order) {
+        dbgs() << "    " << printRegUnit(I->first, TRI);
+        I->second.dump();
+        dbgs() << "\n";
+      }
+    }
+#endif
+  };
+
+  // The saved delay state at the end of each basic block.
+  DenseMap<MachineBasicBlock *, DelayState> BlockState;
+
+  // Emit an s_delay_alu instruction if necessary before MI.
+  MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
+                             MachineInstr *LastDelayAlu) {
+    unsigned Imm = 0;
+
+    // Wait for a TRANS instruction.
+    if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
+      Imm |= 4 + Delay.TRANSNum;
+
+    // Wait for a VALU instruction (if it's more recent than any TRANS
+    // instruction that we're also waiting for).
+    if (Delay.VALUNum < DelayInfo::VALU_MAX &&
+        Delay.VALUNum <= Delay.TRANSNumVALU) {
+      if (Imm & 0xf)
+        Imm |= Delay.VALUNum << 7;
+      else
+        Imm |= Delay.VALUNum;
+    }
+
+    // Wait for an SALU instruction.
+    if (Delay.SALUCycles) {
+      if (Imm & 0x780) {
+        // We have already encoded a VALU and a TRANS delay. There's no room in
+        // the encoding for an SALU delay as well, so just drop it.
+      } else if (Imm & 0xf) {
+        Imm |= (Delay.SALUCycles + 8) << 7;
+      } else {
+        Imm |= Delay.SALUCycles + 8;
+      }
+    }
+
+    // Don't emit the s_delay_alu instruction if there's nothing to wait for.
+    if (!Imm)
+      return LastDelayAlu;
+
+    // If we only need to wait for one instruction, try encoding it in the last
+    // s_delay_alu that we emitted.
+    if (!(Imm & 0x780) && LastDelayAlu) {
+      unsigned Skip = 0;
+      for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
+                E = MachineBasicBlock::instr_iterator(MI);
+           ++I != E;) {
+        if (!I->isBundle() && !I->isMetaInstruction())
+          ++Skip;
+      }
+      if (Skip < 6) {
+        MachineOperand &Op = LastDelayAlu->getOperand(0);
+        unsigned LastImm = Op.getImm();
+        assert((LastImm & ~0xf) == 0 &&
+               "Remembered an s_delay_alu with no room for another delay!");
+        LastImm |= Imm << 7 | Skip << 4;
+        Op.setImm(LastImm);
+        return nullptr;
+      }
+    }
+
+    auto &MBB = *MI.getParent();
+    MachineInstr *DelayAlu =
+        BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
+    // Remember the s_delay_alu for next time if there is still room in it to
+    // encode another delay.
+    return (Imm & 0x780) ? nullptr : DelayAlu;
+  }
+
+  bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
+    DelayState State;
+    for (auto *Pred : MBB.predecessors())
+      State.merge(BlockState[Pred]);
+
+    LLVM_DEBUG(dbgs() << "  State at start of " << printMBBReference(MBB)
+                      << "\n";
+               State.dump(TRI););
+
+    bool Changed = false;
+    MachineInstr *LastDelayAlu = nullptr;
+
+    // Iterate over the contents of bundles, but don't emit any instructions
+    // inside a bundle.
+    for (auto &MI : MBB.instrs()) {
+      if (MI.isBundle() || MI.isMetaInstruction())
+        continue;
+
+      // Ignore some more instructions that do not generate any code.
+      switch (MI.getOpcode()) {
+      case AMDGPU::SI_RETURN_TO_EPILOG:
+        continue;
+      }
+
+      DelayType Type = getDelayType(MI.getDesc().TSFlags);
+
+      if (instructionWaitsForVALU(MI)) {
+        // Forget about all outstanding VALU delays.
+        State = DelayState();
+      } else if (Type != OTHER) {
+        DelayInfo Delay;
+        // TODO: Scan implicit uses too?
+        for (const auto &Op : MI.explicit_uses()) {
+          if (Op.isReg()) {
+            // One of the operands of the writelane is also the output operand.
+            // This creates the insertion of redundant delays. Hence, we have to
+            // ignore this operand.
+            if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
+              continue;
+            for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) {
+              auto It = State.find(*UI);
+              if (It != State.end()) {
+                Delay.merge(It->second);
+                State.erase(*UI);
+              }
+            }
+          }
+        }
+        if (Emit && !MI.isBundledWithPred()) {
+          // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
+          // just ignore them?
+          LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
+        }
+      }
+
+      if (Type != OTHER) {
+        // TODO: Scan implicit defs too?
+        for (const auto &Op : MI.defs()) {
+          unsigned Latency = SchedModel.computeOperandLatency(
+              &MI, MI.getOperandNo(&Op), nullptr, 0);
+          for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI)
+            State[*UI] = DelayInfo(Type, Latency);
+        }
+      }
+
+      // Advance by the number of cycles it takes to issue this instruction.
+      // TODO: Use a more advanced model that accounts for instructions that
+      // take multiple cycles to issue on a particular pipeline.
+      unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
+      // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
+      // instructions on the assumption that they will usually have to be issued
+      // twice?
+      State.advance(Type, Cycles);
+
+      LLVM_DEBUG(dbgs() << "  State after " << MI; State.dump(TRI););
+    }
+
+    if (Emit) {
+      assert(State == BlockState[&MBB] &&
+             "Basic block state should not have changed on final pass!");
+    } else if (State != BlockState[&MBB]) {
+      BlockState[&MBB] = std::move(State);
+      Changed = true;
+    }
+    return Changed;
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(MF.getFunction()))
+      return false;
+
+    LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
+                      << "\n");
+
+    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+    if (!ST.hasDelayAlu())
+      return false;
+
+    SII = ST.getInstrInfo();
+    TRI = ST.getRegisterInfo();
+
+    SchedModel.init(&ST);
+
+    // Calculate the delay state for each basic block, iterating until we reach
+    // a fixed point.
+    SetVector<MachineBasicBlock *> WorkList;
+    for (auto &MBB : reverse(MF))
+      WorkList.insert(&MBB);
+    while (!WorkList.empty()) {
+      auto &MBB = *WorkList.pop_back_val();
+      bool Changed = runOnMachineBasicBlock(MBB, false);
+      if (Changed)
+        WorkList.insert(MBB.succ_begin(), MBB.succ_end());
+    }
+
+    LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
+
+    // Make one last pass over all basic blocks to emit s_delay_alu
+    // instructions.
+    bool Changed = false;
+    for (auto &MBB : MF)
+      Changed |= runOnMachineBasicBlock(MBB, true);
+    return Changed;
+  }
+};
+
+} // namespace
+
+char AMDGPUInsertDelayAlu::ID = 0;
+
+char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
+
+INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
+                false, false)
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@ -272,6 +272,12 @@ static cl::opt<bool> EnableSIModeRegisterPass(
  cl::init(true),
  cl::Hidden);

+// Enable GFX11+ s_delay_alu insertion
+static cl::opt<bool>
+    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
+                         cl::desc("Enable s_delay_alu insertion"),
+                         cl::init(true), cl::Hidden);
+
 // Option is used in lit tests to prevent deadcoding of patterns inspected.
 static cl::opt<bool>
 EnableDCEInRA("amdgpu-dce-in-ra",
@ -363,6 +369,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
+  initializeAMDGPUInsertDelayAluPass(*PR);
  initializeSIInsertHardClausesPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIModeRegisterPass(*PR);
@ -1413,6 +1420,10 @@ void GCNPassConfig::addPreEmitPass() {
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);
+
+  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
+    addPass(&AMDGPUInsertDelayAluID);
+
  addPass(&BranchRelaxationPassID);
 }

--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@ -57,6 +57,7 @@ add_llvm_target(AMDGPUCodeGen
  AMDGPUFrameLowering.cpp
  AMDGPUGlobalISelUtils.cpp
  AMDGPUHSAMetadataStreamer.cpp
+  AMDGPUInsertDelayAlu.cpp
  AMDGPUInstCombineIntrinsic.cpp
  AMDGPUInstrInfo.cpp
  AMDGPUInstructionSelector.cpp
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
@ -2,7 +2,7 @@
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s

 define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX906-LABEL: v_fdot2:
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX10NSA %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefix=GFX10NSA %s

 define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
 ; GFX6-LABEL: gather4_2d:
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
@ -589,6 +589,7 @@ define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
 ; GFX11-NEXT:    ; return to shader part epilog
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s

 define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
 ; GFX10-LABEL: sample_d_1d:
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
@ -12,9 +12,11 @@ define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m
 ; GCN-NEXT:    s_mov_b32 exec_lo, s3
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    v_mov_b32_e32 v4, s1
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GCN-NEXT:    v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
 ; GCN-NEXT:    v_interp_p10_f32 v2, v1, v2, v1
 ; GCN-NEXT:    v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-NEXT:    v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
 ; GCN-NEXT:    exp mrt0 v3, v2, v5, v4 done
 ; GCN-NEXT:    s_endpgm
@ -42,13 +44,16 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inr
 ; GCN-NEXT:    s_mov_b32 exec_lo, s3
 ; GCN-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GCN-NEXT:    v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
 ; GCN-NEXT:    v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
 ; GCN-NEXT:    v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
 ; GCN-NEXT:    v_interp_p10_f32 v4, v3, v4, v3
 ; GCN-NEXT:    v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GCN-NEXT:    v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
 ; GCN-NEXT:    v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GCN-NEXT:    v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
 ; GCN-NEXT:    exp mrt0 v6, v7, v8, v4 done
 ; GCN-NEXT:    s_endpgm
@ -86,8 +91,10 @@ define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg
 ; GCN-NEXT:    v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
 ; GCN-NEXT:    v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
 ; GCN-NEXT:    v_interp_p10_f32 v0, v5, v0, v5
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GCN-NEXT:    v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
 ; GCN-NEXT:    v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GCN-NEXT:    v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
 ; GCN-NEXT:    v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
 ; GCN-NEXT:    exp mrt0 v6, v7, v8, v0 done
@ -123,9 +130,11 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m
 ; GCN-NEXT:    s_mov_b32 exec_lo, s3
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GCN-NEXT:    v_interp_p10_f16_f32 v3, v1, v0, v1
 ; GCN-NEXT:    v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
 ; GCN-NEXT:    v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GCN-NEXT:    v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
 ; GCN-NEXT:    v_add_f16_e32 v0, v3, v0
 ; GCN-NEXT:    ; return to shader part epilog
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@ -68,8 +68,10 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GFX11-NEXT:    v_and_or_b32 v5, 0xffff, v7, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_and_or_b32 v7, 0xffff, v8, v11
 ; GFX11-NEXT:    v_and_or_b32 v6, 0xffff, v10, v9
 ; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
@ -133,8 +135,10 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v8
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX11-NEXT:    v_and_or_b32 v6, 0xffff, v8, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_and_or_b32 v8, 0xffff, v9, v12
 ; GFX11-NEXT:    v_and_or_b32 v7, 0xffff, v11, v10
 ; GFX11-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
@ -235,8 +239,10 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v12
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v13
 ; GFX11-NEXT:    v_readfirstlane_b32 s7, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
 ; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v18, v19, v[15:17], v[5:7], v[8:10]], s[4:7]
@ -359,8 +365,10 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v11
 ; GFX11-NEXT:    v_readfirstlane_b32 s7, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
 ; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v13, v14, v[15:17], v[4:6]], s[4:7] a16
@ -474,8 +482,10 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v13
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v14
 ; GFX11-NEXT:    v_readfirstlane_b32 s7, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13]
 ; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX11-NEXT:    image_bvh64_intersect_ray v[0:3], [v[19:20], v21, v[16:18], v[6:8], v[9:11]], s[4:7]
@ -605,8 +615,10 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v11
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v12
 ; GFX11-NEXT:    v_readfirstlane_b32 s7, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
 ; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX11-NEXT:    image_bvh64_intersect_ray v[0:3], [v[14:15], v16, v[17:19], v[4:6]], s[4:7] a16
@ -984,6 +996,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray,
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX11-NEXT:    s_mov_b32 s4, 0xb36211c7
 ; GFX11-NEXT:    s_movk_i32 s5, 0x102
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v10, s5
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
@ -1123,6 +1136,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX11-NEXT:    s_mov_b32 s4, 0xb36211c6
 ; GFX11-NEXT:    s_movk_i32 s5, 0x102
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@ -3,7 +3,7 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s

 define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX6-LABEL: v_saddsat_i7:
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@ -3,7 +3,7 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s

 define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX6-LABEL: v_ssubsat_i7:
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@ -3,7 +3,7 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s

 define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX6-LABEL: v_uaddsat_i7:
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@ -3,7 +3,7 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s

 define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX6-LABEL: v_usubsat_i7:
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@ -265,6 +265,7 @@ define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float a
 ; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_f32_e32 v1, 0.5, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f32_e32 v1, 0x80000000, v1
 ; GFX11-NEXT:    v_min_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@ -342,6 +343,7 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %o
 ; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f32_e32 v1, 0x80000000, v1
 ; GFX11-NEXT:    v_min_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@ -423,6 +425,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, f
 ; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f32_e32 v1, 0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v2, 1.0, v1
 ; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
@ -1650,6 +1653,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, f
 ; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_f32_e32 v1, 0.5, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, 0, 1.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
@ -1788,6 +1792,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %ou
 ; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, 0, 1.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
@ -1858,6 +1863,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspac
 ; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, 0, 1.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
@ -2582,6 +2588,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(<2 x half> addrspace(1)* %out,
 ; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, 2.0
 ; GFX11-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@ -2664,6 +2671,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(<2 x half> addrspace(1)* %out,
 ; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, 0
 ; GFX11-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@ -2818,6 +2826,7 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(<2 x half> addrspace(1)* %out, <
 ; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
@ -3298,6 +3307,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(float addrspace(1)* %out, flo
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_add_f32_e64 v0, s4, s5
 ; GFX11-NEXT:    v_add_f32_e64 v1, s4, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f32_e64 v0, v0, v1 clamp
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1] offset:12
 ; GFX11-NEXT:    s_endpgm
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@ -476,8 +476,10 @@ define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inre
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v9, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v10, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_add_f32_e32 v2, 1.0, v8
 ; GFX11-NEXT:    v_add_f32_e32 v3, 1.0, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX11-NEXT:    v_mov_b32_e32 v7, v4
--- a/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll
+++ b/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll
@ -14,24 +14,31 @@ define amdgpu_ps void @_amdgpu_ps_main(i32 inreg %PrimMask, <2 x float> %InterpC
 ; GCN-NEXT:    lds_param_load v5, attr1.z wait_vdst:15
 ; GCN-NEXT:    lds_param_load v6, attr1.w wait_vdst:15
 ; GCN-NEXT:    v_mbcnt_lo_u32_b32 v7, -1, 0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GCN-NEXT:    v_mbcnt_hi_u32_b32 v7, -1, v7
 ; GCN-NEXT:    v_and_b32_e32 v7, 1, v7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GCN-NEXT:    v_interp_p10_f32 v8, v4, v2, v4 wait_exp:2
 ; GCN-NEXT:    v_interp_p10_f32 v10, v5, v2, v5 wait_exp:1
 ; GCN-NEXT:    v_interp_p10_f32 v9, v6, v2, v6
 ; GCN-NEXT:    v_interp_p10_f32 v2, v3, v2, v3 wait_exp:7
 ; GCN-NEXT:    v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GCN-NEXT:    v_interp_p2_f32 v5, v5, v1, v10 wait_exp:7
 ; GCN-NEXT:    v_interp_p2_f32 v6, v6, v1, v9 wait_exp:7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GCN-NEXT:    v_interp_p2_f32 v2, v3, v1, v2 wait_exp:7
 ; GCN-NEXT:    v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6]
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GCN-NEXT:    v_mov_b32_dpp v6, v6 dpp8:[1,0,3,2,5,4,7,6]
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc_lo
 ; GCN-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GCN-NEXT:    v_cndmask_b32_e32 v5, v2, v6, vcc_lo
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
 ; GCN-NEXT:    v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6]
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GCN-NEXT:    v_mov_b32_dpp v5, v5 dpp8:[1,0,3,2,5,4,7,6]
 ; GCN-NEXT:    s_mov_b32 exec_lo, s1
 ; GCN-NEXT:    exp dual_src_blend0 v3, v2, off, off
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@ -67,6 +67,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e64 v1, s0, 4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:1 dlc
@ -154,6 +155,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e64 v1, s0, 4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:1 dlc
@ -241,6 +243,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e64 v1, s0, 4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:1 dlc
@ -311,6 +314,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
 ; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
@ -327,6 +331,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e64 v1, s0, 4
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 2
@ -401,6 +406,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
 ; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
@ -418,6 +424,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e64 v1, s0, 4
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 2
@ -492,6 +499,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 4
 ; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
@ -509,6 +517,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e64 v1, s0, 4
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 2
@ -580,6 +589,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v2, 4, s0, v0
 ; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 4
 ; GFX11-SDAG-NEXT:    scratch_store_b8 v2, v1, off offset:1 dlc
@ -597,6 +607,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e64 v1, s0, 4
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 2
@ -671,6 +682,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v3, 4, s0, v0
 ; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 4
 ; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 dlc
@ -689,6 +701,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e64 v1, s0, 4
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 2
@ -763,6 +776,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 4
 ; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
@ -780,6 +794,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e64 v1, s0, 4
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 2
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@ -54,6 +54,7 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX11-LABEL: zero_init_kernel:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mov_b32 s1, s0
 ; GFX11-NEXT:    s_mov_b32 s2, s0
 ; GFX11-NEXT:    s_mov_b32 s3, s0
@ -169,6 +170,7 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX11-PAL-LABEL: zero_init_kernel:
 ; GFX11-PAL:       ; %bb.0:
 ; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
@ -231,6 +233,7 @@ define void @zero_init_foo() {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mov_b32 s1, s0
 ; GFX11-NEXT:    s_mov_b32 s2, s0
 ; GFX11-NEXT:    s_mov_b32 s3, s0
@ -304,6 +307,7 @@ define void @zero_init_foo() {
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
@ -681,6 +685,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
 ; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@ -743,6 +748,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX11-PAL:       ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
+; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
 ; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
@ -810,6 +816,7 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX11-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX11-NEXT:    scratch_store_b32 v0, v2, s32 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@ -865,6 +872,7 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX11-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
+; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, s32 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
@ -1021,6 +1029,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mov_b32 s1, s0
 ; GFX11-NEXT:    s_mov_b32 s2, s0
 ; GFX11-NEXT:    s_mov_b32 s3, s0
@ -1148,6 +1157,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
@ -1219,6 +1229,7 @@ define void @zero_init_small_offset_foo() {
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s32 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mov_b32 s1, s0
 ; GFX11-NEXT:    s_mov_b32 s2, s0
 ; GFX11-NEXT:    s_mov_b32 s3, s0
@ -1300,6 +1311,7 @@ define void @zero_init_small_offset_foo() {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s32 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
@ -4217,6 +4229,7 @@ define amdgpu_ps void @large_offset() {
 ; GFX11-LABEL: large_offset:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v0
@ -4317,6 +4330,7 @@ define amdgpu_ps void @large_offset() {
 ; GFX11-PAL-LABEL: large_offset:
 ; GFX11-PAL:       ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, v0
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
@ -0,0 +1,561 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s
+
+---
+name: valu_dep_1
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}valu_dep_1:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_2
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}valu_dep_2:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_3
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}valu_dep_3:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+    ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
+    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+    $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_4
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}valu_dep_4:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+    ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
+    ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3
+    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+    $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
+    $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+# There's no encoding for VALU_DEP_5. A normal VALU instruction will have
+# completed already.
+---
+name: valu_dep_5
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}valu_dep_5:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+    ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
+    ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3
+    ; CHECK-NEXT: v_add_nc_u32_e32 v4, v4, v4
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+    $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
+    $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
+    $vgpr4 = V_ADD_U32_e32 $vgpr4, $vgpr4, implicit $exec
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: trans32_dep_1
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}trans32_dep_1:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+    ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: trans32_dep_2
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}trans32_dep_2:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+    ; CHECK-NEXT: v_exp_f32_e32 v1, v1
+    ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+    $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: trans32_dep_3
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}trans32_dep_3:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+    ; CHECK-NEXT: v_exp_f32_e32 v1, v1
+    ; CHECK-NEXT: v_exp_f32_e32 v2, v2
+    ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+    $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
+    $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+# There's no encoding for TRANS32_DEP_4. A normal TRANS instruction will have
+# completed already.
+---
+name: trans32_dep_4
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}trans32_dep_4:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+    ; CHECK-NEXT: v_exp_f32_e32 v1, v1
+    ; CHECK-NEXT: v_exp_f32_e32 v2, v2
+    ; CHECK-NEXT: v_exp_f32_e32 v3, v3
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+    $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
+    $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
+    $vgpr3 = V_EXP_F32_e32 $vgpr3, implicit $exec, implicit $mode
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: salu_cycle_1
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}salu_cycle_1:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: s_mov_b32 s0, 0
+    ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
+    $sgpr0 = S_MOV_B32 0
+    $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
+...
+
+# There's no need for SALU_CYCLE_2 here because the s_mov will have completed
+# already.
+---
+name: salu_cycle_2
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}salu_cycle_2:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: s_mov_b32 s0, 0
+    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
+    $sgpr0 = S_MOV_B32 0
+    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+    $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_1_same_trans32_dep_1
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}valu_dep_1_same_trans32_dep_1:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+    ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
+    $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+...
+
+# There's no need to encode the VALU depdendency because it will complete before
+# the TRANS.
+---
+name: trans32_dep_1_only
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}trans32_dep_1_only:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    ; CHECK-NEXT: v_exp_f32_e32 v1, v1
+    ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+...
+
+---
+name: valu_dep_1_same_salu_cycle_1
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}valu_dep_1_same_salu_cycle_1:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    ; CHECK-NEXT: s_mov_b32 s0, 0
+    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    $sgpr0 = S_MOV_B32 0
+    $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_1_next_valu_dep_1
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}valu_dep_1_next_valu_dep_1:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_2_next_valu_dep_2
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}valu_dep_2_next_valu_dep_2:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+...
+
+# There's no need to encode a dependency for the second mul, because the
+# dependency for the first mul has already guaranteed that the add has
+# completed.
+---
+name: valu_dep_1_no_next_1
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}valu_dep_1_no_next_1:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
+    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+    ; CHECK-NEXT: v_mul_f32_e32 v1, v0, v0
+    ; CHECK-NEXT: v_mul_f32_e32 v2, v0, v0
+    $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
+    $vgpr1 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
+    $vgpr2 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
+...
+
+# There's no need to encode a dependency for the second add, because the
+# dependency for the second mul has already guaranteed that a later VALU has
+# completed.
+---
+name: valu_dep_1_no_next_2
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}valu_dep_1_no_next_2:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
+    ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1
+    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+    ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1
+    ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
+    $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
+    $vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode
+    $vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode
+    $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
+...
+
+# There are no wait states between an add/sub/cmp generating carry and an
+# add/sub/cndmask that consumes it, so no need to encode a dependency.
+
+---
+name: implicit_cmp_cndmask
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}implicit_cmp_cndmask:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, v0, v1
+    ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, vcc
+    implicit $vcc = V_CMP_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
+    $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $vcc, implicit $exec
+...
+
+# TODO: There should be no s_delay_alu here.
+---
+name: explicit_cmp_cndmask
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}explicit_cmp_cndmask:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_cmp_eq_i32_e64 s[0:1], v0, v1
+    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+    ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1]
+    $sgpr0_sgpr1 = V_CMP_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec
+    $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $sgpr0_sgpr1, implicit $exec
+...
+
+---
+name: implicit_addc_addc
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}implicit_addc_addc:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_add_co_ci_u32_e32 v0, vcc, v0, v0, vcc
+    ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc
+    $vgpr0 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+    $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
+...
+
+---
+name: explicit_addc_addc
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}explicit_addc_addc:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_add_co_u32 v0, vcc, v0, v0
+    ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc
+    $vgpr0,$vcc = V_ADD_CO_U32_e64 $vgpr0, $vgpr0, 0, implicit $exec
+    $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
+...
+
+---
+name: valu_dep_3_bundle
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}valu_dep_3_bundle:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+    ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
+    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    BUNDLE {
+      $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+      $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
+    }
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: if
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}if:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: s_cbranch_vccz .LBB23_2
+    ; CHECK-NEXT: %bb.1:
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    ; CHECK-NEXT: .LBB23_2:
+    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    S_CBRANCH_VCCZ %bb.2, implicit $vcc
+  bb.1:
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+  bb.2:
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: else
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}else:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: s_cbranch_vccz .LBB24_2
+    ; CHECK-NEXT: %bb.1
+    ; CHECK-NEXT: s_branch .LBB24_3
+    ; CHECK-NEXT: .LBB24_2:
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    ; CHECK-NEXT: .LBB24_3:
+    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    S_CBRANCH_VCCZ %bb.2, implicit $vcc
+  bb.1:
+    S_BRANCH %bb.3
+  bb.2:
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+  bb.3:
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: if_else
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}if_else:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: s_cbranch_vccz .LBB25_2
+    ; CHECK-NEXT: %bb.1:
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    ; CHECK-NEXT: s_branch .LBB25_3
+    ; CHECK-NEXT: .LBB25_2:
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v1
+    ; CHECK-NEXT: .LBB25_3:
+    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    S_CBRANCH_VCCZ %bb.2, implicit $vcc
+  bb.1:
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    S_BRANCH %bb.3
+  bb.2:
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    $vgpr0 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+  bb.3:
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+# Dependency from outside the loop.
+---
+name: loop_1
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}loop_1:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    ; CHECK-NEXT: .LBB26_1:
+    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v0, v0
+    ; CHECK-NEXT: s_cbranch_vccz .LBB26_1
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+  bb.1:
+    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+  bb.2:
+...
+
+# Dependency from inside the loop.
+---
+name: loop_2
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}loop_2:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: .LBB27_1:
+    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    ; CHECK-NEXT: s_cbranch_vccz .LBB27_1
+  bb.1:
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+  bb.2:
+...
+
+# No VALU delay across s_sendmsg_rtn because it waits for all outstanding VALU
+# to complete.
+---
+name: sendmsg_rtn
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}sendmsg_rtn:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_mov_b32_e32 v0, 0
+    ; CHECK-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
+    ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+    ; CHECK-NEXT: s_add_u32 s0, s0, s0
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $sgpr0 = S_SENDMSG_RTN_B32 128
+    $sgpr0 = S_ADD_U32 $sgpr0, $sgpr0, implicit-def $scc
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+# No VALU delay before or across FLAT because it waits for all outstanding VALU
+# to complete.
+---
+name: flat_load
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}flat_load:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_mov_b32_e32 v0, 0
+    ; CHECK-NEXT: v_mov_b32_e32 v1, 0
+    ; CHECK-NEXT: v_mov_b32_e32 v2, 0
+    ; CHECK-NEXT: flat_load_b32 v0, v[0:1]
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v2, v2
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
+...
+
+# No VALU delay across an s_waitcnt_depctr that waits for all outstanding VALU
+# to complete.
+---
+name: waitcnt_depctr
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}waitcnt_depctr:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_mov_b32_e32 v0, 0
+    ; CHECK-NEXT: s_waitcnt_depctr 0xfff
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    S_WAITCNT_DEPCTR 4095
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+# Check that no delays are emitted for writelane instructions.
+---
+name: writelane1
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}writelane1:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_writelane_b32 v0, s0, 0
+    ; CHECK-NEXT: v_writelane_b32 v0, s0, 1
+    ; CHECK-NEXT: v_writelane_b32 v0, s0, 2
+    ; CHECK-NEXT: v_writelane_b32 v0, s0, 3
+    $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0
+    $vgpr0 = V_WRITELANE_B32 $sgpr0, 1, $vgpr0
+    $vgpr0 = V_WRITELANE_B32 $sgpr0, 2, $vgpr0
+    $vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
+...
+
+# Check if a VALU delay is added after writelane.
+---
+name: writelane2
+body: |
+  bb.0:
+    ; CHECK-LABEL: {{^}}writelane2:
+    ; CHECK: %bb.0:
+    ; CHECK-NEXT: v_writelane_b32 v0, s0, 3
+    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+    $vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@ -388,6 +388,7 @@
 ; GCN-O1-NEXT:        SI Final Branch Preparation
 ; GCN-O1-NEXT:        SI peephole optimizations
 ; GCN-O1-NEXT:        Post RA hazard recognizer
+; GCN-O1-NEXT:        AMDGPU Insert Delay ALU
 ; GCN-O1-NEXT:        Branch relaxation pass
 ; GCN-O1-NEXT:        Register Usage Information Collector Pass
 ; GCN-O1-NEXT:        Live DEBUG_VALUE analysis
@ -676,6 +677,7 @@
 ; GCN-O1-OPTS-NEXT:        SI Final Branch Preparation
 ; GCN-O1-OPTS-NEXT:        SI peephole optimizations
 ; GCN-O1-OPTS-NEXT:        Post RA hazard recognizer
+; GCN-O1-OPTS-NEXT:        AMDGPU Insert Delay ALU
 ; GCN-O1-OPTS-NEXT:        Branch relaxation pass
 ; GCN-O1-OPTS-NEXT:        Register Usage Information Collector Pass
 ; GCN-O1-OPTS-NEXT:        Live DEBUG_VALUE analysis
@ -966,6 +968,7 @@
 ; GCN-O2-NEXT:        SI Final Branch Preparation
 ; GCN-O2-NEXT:        SI peephole optimizations
 ; GCN-O2-NEXT:        Post RA hazard recognizer
+; GCN-O2-NEXT:        AMDGPU Insert Delay ALU
 ; GCN-O2-NEXT:        Branch relaxation pass
 ; GCN-O2-NEXT:        Register Usage Information Collector Pass
 ; GCN-O2-NEXT:        Live DEBUG_VALUE analysis
@ -1268,6 +1271,7 @@
 ; GCN-O3-NEXT:        SI Final Branch Preparation
 ; GCN-O3-NEXT:        SI peephole optimizations
 ; GCN-O3-NEXT:        Post RA hazard recognizer
+; GCN-O3-NEXT:        AMDGPU Insert Delay ALU
 ; GCN-O3-NEXT:        Branch relaxation pass
 ; GCN-O3-NEXT:        Register Usage Information Collector Pass
 ; GCN-O3-NEXT:        Live DEBUG_VALUE analysis
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
@ -86,6 +86,7 @@ define amdgpu_kernel void @id_row_i32() #0 {
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0x63
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    s_mov_b32 m0, s0
 ; GFX11-SDAG-NEXT:    exp pos0 v0, off, off, off done row_en
 ; GFX11-SDAG-NEXT:    s_endpgm
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@ -15,6 +15,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
 ; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_dot2_f32_bf16 v0, s2, s3, v0 clamp
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
@ -43,6 +44,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
 ; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_dot2_f32_bf16 v0, s2, s3, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@ -4,7 +4,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-prt-strict-null -verify-machineinstrs < %s | FileCheck -check-prefixes=NOPRT %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s

 define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) {
 ; VERDE-LABEL: load_1d:
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s

 define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
 ; GFX9-LABEL: gather4_2d:
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s

 define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
 ; GFX9-LABEL: sample_1d:
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
@ -3,7 +3,7 @@
 ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GFX81 %s
 ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s
 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s

 define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
 ; TONGA-LABEL: image_sample_2d_f16:
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=VERDE %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s

 define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
 ; VERDE-LABEL: sample_1d:
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
@ -34,6 +34,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00]
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
 ; GFX11-NEXT:    v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x56,0xd6,0x03,0x21,0x09,0x04]
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x56,0xd6,0x01,0x21,0x01,0x04]
 ; GFX11-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x00]
@ -62,8 +63,10 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX11-NEXT:    v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e]
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v2 ; encoding: [0x02,0x03,0x06,0x7e]
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; encoding: [0x93,0x00,0x87,0xbf]
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v9 ; encoding: [0xff,0x12,0x04,0x36,0xff,0xff,0x00,0x00]
 ; GFX11-NEXT:    v_lshl_or_b32 v4, v4, 16, v2 ; encoding: [0x04,0x00,0x56,0xd6,0x04,0x21,0x09,0x04]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) ; encoding: [0x03,0x00,0x87,0xbf]
 ; GFX11-NEXT:    v_lshl_or_b32 v2, v1, 16, v0 ; encoding: [0x02,0x00,0x56,0xd6,0x01,0x21,0x01,0x04]
 ; GFX11-NEXT:    image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x0f,0xe4,0xf0,0x02,0x00,0x00,0x08]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
@ -105,6 +108,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00]
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
 ; GFX11-NEXT:    v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x56,0xd6,0x04,0x21,0x0d,0x04]
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x56,0xd6,0x02,0x21,0x05,0x04]
 ; GFX11-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe8,0xf0,0x00,0x00,0x00,0x08,0x01,0x03,0x05,0x06]
@ -147,6 +151,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00]
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
 ; GFX11-NEXT:    v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x56,0xd6,0x03,0x21,0x09,0x04]
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x56,0xd6,0x01,0x21,0x01,0x04]
 ; GFX11-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0x7c,0xf1,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x06]
@ -193,6 +198,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00]
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
 ; GFX11-NEXT:    v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x56,0xd6,0x04,0x21,0x01,0x04]
 ; GFX11-NEXT:    v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x56,0xd6,0x08,0x21,0x05,0x04]
 ; GFX11-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x0f,0x50,0xf1,0x02,0x00,0x00,0x08]
@ -226,6 +232,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00]
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
 ; GFX11-NEXT:    v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x56,0xd6,0x05,0x21,0x01,0x04]
 ; GFX11-NEXT:    v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x56,0xd6,0x0a,0x21,0x05,0x04]
 ; GFX11-NEXT:    image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x14,0x04,0xf0,0xf0,0x02,0x00,0x00,0x08]
@ -259,6 +266,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00]
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
 ; GFX11-NEXT:    v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x56,0xd6,0x05,0x21,0x01,0x04]
 ; GFX11-NEXT:    v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x56,0xd6,0x0a,0x21,0x05,0x04]
 ; GFX11-NEXT:    image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x14,0x06,0xf0,0xf0,0x02,0x00,0x00,0x08]
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s

 define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
 ; GFX10-LABEL: sample_d_1d:
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
@ -12,9 +12,11 @@ define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m
 ; GCN-NEXT:    s_mov_b32 exec_lo, s3
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    v_mov_b32_e32 v4, s1
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GCN-NEXT:    v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
 ; GCN-NEXT:    v_interp_p10_f32 v2, v1, v2, v1
 ; GCN-NEXT:    v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-NEXT:    v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
 ; GCN-NEXT:    exp mrt0 v3, v2, v5, v4 done
 ; GCN-NEXT:    s_endpgm
@ -42,13 +44,16 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inr
 ; GCN-NEXT:    s_mov_b32 exec_lo, s3
 ; GCN-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GCN-NEXT:    v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
 ; GCN-NEXT:    v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
 ; GCN-NEXT:    v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
 ; GCN-NEXT:    v_interp_p10_f32 v4, v3, v4, v3
 ; GCN-NEXT:    v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GCN-NEXT:    v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
 ; GCN-NEXT:    v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GCN-NEXT:    v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
 ; GCN-NEXT:    exp mrt0 v6, v7, v8, v4 done
 ; GCN-NEXT:    s_endpgm
@ -86,8 +91,10 @@ define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg
 ; GCN-NEXT:    v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
 ; GCN-NEXT:    v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
 ; GCN-NEXT:    v_interp_p10_f32 v0, v5, v0, v5
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GCN-NEXT:    v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
 ; GCN-NEXT:    v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GCN-NEXT:    v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
 ; GCN-NEXT:    v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
 ; GCN-NEXT:    exp mrt0 v6, v7, v8, v0 done
@ -123,9 +130,11 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m
 ; GCN-NEXT:    s_mov_b32 exec_lo, s3
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GCN-NEXT:    v_interp_p10_f16_f32 v3, v1, v0, v1
 ; GCN-NEXT:    v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
 ; GCN-NEXT:    v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GCN-NEXT:    v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
 ; GCN-NEXT:    v_add_f16_e32 v0, v3, v0
 ; GCN-NEXT:    ; return to shader part epilog
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@ -233,6 +233,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr
 ; GFX11-NEXT:    v_mov_b32_e32 v8, 2.0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, s4, s4, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
 ; GFX11-NEXT:    v_add_co_u32 v2, s4, s6, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s4
@ -325,6 +326,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node
 ; GFX11-NEXT:    v_mov_b32_e32 v5, 2.0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, s4, s4, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
 ; GFX11-NEXT:    v_add_co_u32 v2, s4, s6, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s4
@ -428,6 +430,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray,
 ; GFX11-NEXT:    v_mov_b32_e32 v10, 0x102
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, s4, s4, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
 ; GFX11-NEXT:    flat_load_b32 v11, v[0:1]
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x40c00000
@ -515,6 +518,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_
 ; GFX11-NEXT:    v_mov_b32_e32 v7, 0x102
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, s4, s4, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
 ; GFX11-NEXT:    flat_load_b32 v8, v[0:1]
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x46004200
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
@ -14,6 +14,7 @@ define amdgpu_kernel void @test_s(i32 addrspace(1)* %out, i32 %src0) {
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane64_b32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
@ -28,6 +29,7 @@ define amdgpu_kernel void @test_i(i32 addrspace(1)* %out) {
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x63
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_permlane64_b32 v0, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@ -83,20 +83,25 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
 ; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v4, v3, 0
 ; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v5, v2, 0
 ; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v5, v3, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX11-NEXT:    v_mul_lo_u32 v5, v5, v2
 ; GFX11-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v4, v5
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v7, v10, vcc_lo
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v12, vcc_lo
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
@ -223,31 +228,40 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
 ; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v4, v3, 0
 ; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v5, v2, 0
 ; GFX11-NEXT:    v_mad_i64_i32 v[11:12], null, v5, v3, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
 ; GFX11-NEXT:    v_mul_lo_u32 v8, v5, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v9
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v7, v10, vcc_lo
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v12, vcc_lo
 ; GFX11-NEXT:    v_mul_lo_u32 v9, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v11
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v6, v2
 ; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v5
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v9, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v10, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
@ -372,6 +386,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
 ; GFX11-NEXT:    s_add_i32 s1, s1, s6
 ; GFX11-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, s0, 0, s2
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
@ -548,8 +563,10 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX11-NEXT:    s_cmp_lt_i32 s3, 0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, s4, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
 ; GFX11-NEXT:    v_sub_co_u32 v3, vcc_lo, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
 ; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX11-NEXT:    s_add_i32 s1, s8, s7
@ -558,7 +575,9 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
 ; GFX11-NEXT:    s_ashr_i32 s4, s1, 31
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mov_b32 s5, s4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, s1, 0, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, s0, 0, vcc_lo
@ -617,9 +636,11 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
 ; GFX11-NEXT:    v_alignbit_b32 v3, v1, v0, 30
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
 ; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@ -677,8 +698,10 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX11-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
 ; GFX11-NEXT:    v_alignbit_b32 v3, v1, v0, 30
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@ -34,6 +34,7 @@ define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %sext0 = sext i32 %arg0 to i64
@ -71,6 +72,7 @@ define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %sext0 = sext i32 %arg0 to i64
@ -108,6 +110,7 @@ define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %sext0 = zext i32 %arg0 to i64
@ -145,6 +148,7 @@ define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %sext0 = zext i32 %arg0 to i64
@ -248,22 +252,29 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
 ; GFX11-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v14, 31, v0
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v15, 31, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8]
 ; GFX11-NEXT:    v_mov_b32_e32 v7, v10
 ; GFX11-NEXT:    v_mov_b32_e32 v10, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10]
 ; GFX11-NEXT:    v_mad_i64_i32 v[9:10], null, v1, v14, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v8, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mad_i64_i32 v[12:13], null, v15, v0, v[9:10]
 ; GFX11-NEXT:    v_add_co_u32 v7, s0, v7, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, 0, 0, s0
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v14, v15, v[7:8]
 ; GFX11-NEXT:    v_mov_b32_e32 v7, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v12
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %sext0 = sext i32 %arg0 to i128
@ -301,6 +312,7 @@ define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %sext0 = sext i32 %arg0 to i63
@ -346,6 +358,7 @@ define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_bfe_i32 v4, v1, 0, 31
 ; GFX11-NEXT:    v_bfe_i32 v5, v0, 0, 31
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %sext0 = sext i31 %arg0 to i63
@ -394,9 +407,11 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %ext0 = sext i32 %arg0 to i64
@ -433,6 +448,7 @@ define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %trunc.lhs = and i64 %arg0, 4294967295
@ -481,8 +497,10 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5]
 ; GFX11-NEXT:    v_and_b32_e32 v5, 1, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@ -532,9 +550,11 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5]
 ; GFX11-NEXT:    v_and_b32_e32 v4, 1, v3
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %trunc.lhs = and i64 %arg0, 4294967295
@ -571,6 +591,7 @@ define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v3, v2, v[4:5]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %shl.lhs = shl i64 %arg0, 32
@ -610,6 +631,7 @@ define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@ -731,6 +753,7 @@ define i64 @mad_i64_i32_twice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3) #0 {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_mad_i64_i32 v[6:7], null, v0, v1, v[2:3]
 ; GFX11-NEXT:    v_mad_i64_i32 v[2:3], null, v0, v1, v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v6, v2
 ; GFX11-NEXT:    v_xor_b32_e32 v1, v7, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@ -794,14 +817,17 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_mad_i64_i32 v[8:9], null, v0, v1, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v3, vcc_lo
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v4
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v8, v6
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v9, v7, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@ -852,8 +878,10 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_mad_i64_i32 v[4:5], null, v0, v1, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@ -908,9 +936,11 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v7, v2, v[4:5]
 ; GFX11-NEXT:    v_mul_lo_u32 v3, v7, v3
 ; GFX11-NEXT:    v_mul_lo_u32 v2, v6, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v2, v1, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %m = mul i48 %arg0, %arg1
--- a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx900 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1030 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s

 define amdgpu_ps float @mad_i32_vvv(i32 %a, i32 %b, i32 %c) {
 ; GFX9-LABEL: mad_i32_vvv:
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@ -269,6 +269,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-WGP-NEXT:    v_add_co_u32 v0, s0, s0, v0
 ; GFX11-WGP-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] slc dlc
@ -283,6 +284,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-CU-NEXT:    v_add_co_u32 v0, s0, s0, v0
 ; GFX11-CU-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] slc dlc
@ -561,6 +563,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-WGP-NEXT:    v_add_co_u32 v0, s0, s2, v0
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[1:2]
 ; GFX11-WGP-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s0
@ -575,6 +578,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-CU-NEXT:    v_add_co_u32 v0, s0, s2, v0
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[1:2]
 ; GFX11-CU-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s0
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@ -165,6 +165,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-WGP-NEXT:    v_add_co_u32 v0, s0, s0, v0
 ; GFX11-WGP-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc dlc
@ -180,6 +181,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-CU-NEXT:    v_add_co_u32 v0, s0, s0, v0
 ; GFX11-CU-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc dlc
@ -359,6 +361,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-WGP-NEXT:    v_add_co_u32 v0, s0, s2, v0
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[1:2]
 ; GFX11-WGP-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s0
@ -374,6 +377,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-CU-NEXT:    v_add_co_u32 v0, s0, s2, v0
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[1:2]
 ; GFX11-CU-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s0