diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index ff73705b04ed..8fe4ac12d339 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -41,6 +41,7 @@ add_llvm_target(RISCVCodeGen RISCVTargetTransformInfo.cpp VentusRegextInsertion.cpp VentusVVInstrConversion.cpp + VentusInsertJoinToVBranch.cpp GISel/RISCVCallLowering.cpp GISel/RISCVInstructionSelector.cpp GISel/RISCVLegalizerInfo.cpp diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index c0a3f18b9ffb..217756afa8a9 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -72,6 +72,9 @@ void initializeVentusRegextInsertionPass(PassRegistry &); FunctionPass *createVentusVVInstrConversionPass(); void initializeVentusVVInstrConversionPass(PassRegistry &); +FunctionPass *createVentusInsertJoinToVBranchPass(); +void initializeVentusInsertJoinToVBranchPass(PassRegistry &); + InstructionSelector *createRISCVInstructionSelector(const RISCVTargetMachine &, RISCVSubtarget &, RISCVRegisterBankInfo &); diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 0e831a897430..a87e65e3942b 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -286,6 +286,7 @@ void RISCVPassConfig::addPreEmitPass2() { // Insert regext instruction for instruction whose register id is greater // than 31. 
addPass(createVentusRegextInsertionPass());
+  addPass(createVentusInsertJoinToVBranchPass());
 }
 
 void RISCVPassConfig::addMachineSSAOptimization() {
diff --git a/llvm/lib/Target/RISCV/VentusInsertJoinToVBranch.cpp b/llvm/lib/Target/RISCV/VentusInsertJoinToVBranch.cpp
new file mode 100644
index 000000000000..a6e47b62472d
--- /dev/null
+++ b/llvm/lib/Target/RISCV/VentusInsertJoinToVBranch.cpp
@@ -0,0 +1,432 @@
+//===-- VentusInsertJoinToVBranch.cpp - Insert join to VBranches ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// In Ventus, when VBranch (divergent branch) instructions are generated, join
+// instructions must be inserted on both the `then` and the `else` path to tell
+// the hardware where the two paths need to join together.
+//
+// The pass follows these rules to insert join blocks and join instructions:
+//
+// 1: Legalize all the return blocks
+//    When there is more than one return block in a machine function, there must
+//    be branches, so the number of return blocks is reduced down to 1.
+//    1.1: If two return blocks have a common nearest parent branch, the two
+//    blocks are joined and the parent branch is marked hasBeenJoined.
+//    1.2: After 1.1 there may still be more than one return block, so further
+//    join blocks are added iteratively. For two return blocks, the nearest
+//    common dominator is looked up and the dominator tree path between that
+//    dominator and each return block is checked for other branch blocks. The
+//    branch blocks on the path must already be joined and marked; if one is
+//    not, the path is illegal and the two blocks can not be joined yet.
+//
+// 2: Insert join instructions
+//    2.1: Scan through the machine basic blocks (MBB below) and check which
+//    blocks need a join instruction.
+//    2.2: The MBB must have more than one predecessor and its immediate
+//    dominator must end with a VBranch.
+//    2.3: Then the predecessors of the MBB are analyzed: if a predecessor does
+//    not end with a VBranch, a join instruction is appended to its end;
+//    otherwise a join block is inserted between the predecessor and the MBB.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVInstrInfo.h"
+#include "RISCVTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/CodeGenPassBuilder.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include <memory>
+
+#define VENTUS_INSERT_JOIN_TO_BRANCH "Insert join to VBranch"
+#define DEBUG_TYPE "Insert_join_to_VBranch"
+
+using namespace llvm;
+
+namespace {
+
+/// Bookkeeping for a block that ends with a conditional branch.
+struct BranchInfo {
+  bool isDivergentBranch = false; // MBB is divergent branch or not
+  bool hasBeenJoined = false;     // MBB has been joined
+};
+
+/// Append to \p BranchParents the nearest predecessors of \p MBB, direct or
+/// indirect, that have at least two successors. \p Expanded holds the blocks
+/// whose predecessors were already walked, so every block is walked at most
+/// once even when predecessor chains overlap or form a cycle.
+static void collectNearestParentBranches(
+    MachineBasicBlock &MBB, SmallPtrSetImpl<MachineBasicBlock *> &Expanded,
+    SmallVectorImpl<MachineBasicBlock *> &BranchParents) {
+  if (!Expanded.insert(&MBB).second)
+    return;
+  for (MachineBasicBlock *Pred : MBB.predecessors()) {
+    if (Pred->succ_size() < 2)
+      collectNearestParentBranches(*Pred, Expanded, BranchParents);
+    else if (!is_contained(BranchParents, Pred))
+      BranchParents.push_back(Pred);
+  }
+}
+
+class VentusInsertJoinToVBranch : public MachineFunctionPass {
+
+public:
+  const RISCVInstrInfo *TII = nullptr;
+  static char ID;
+  MachineFunction *MachineFunc = nullptr;
+  const RISCVRegisterInfo *MRI = nullptr;
+  const MachineRegisterInfo *MR = nullptr;
+  SmallVector<MachineBasicBlock *> ReturnBlock;
+  SmallDenseMap<MachineBasicBlock *, BranchInfo> BranchMBBInfo;
+  // This pass edits the CFG and recomputes dominance in between, so it owns a
+  // private dominator tree instead of requesting one from the pass manager.
+  std::unique_ptr<MachineDominatorTree> MDT =
+      std::make_unique<MachineDominatorTree>();
+
+  VentusInsertJoinToVBranch() : MachineFunctionPass(ID) {
+    initializeVentusInsertJoinToVBranchPass(*PassRegistry::getPassRegistry());
+  }
+
+  /// Collect all the branch blocks information in function
+  void collectBranchMBBInfo(MachineFunction &MF);
+
+  /// Make the return blocks \p MBB1 and \p MBB2 branch to one new return block
+  /// instead of returning themselves. Returns true if they were joined.
+  bool insertJoinMBB(MachineBasicBlock &MBB1, MachineBasicBlock &MBB2);
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  /// Erase the PseudoRET that ends \p MBB, or turn the PseudoTAIL that ends it
+  /// into a PseudoCALL. Returns true if the block was changed.
+  bool legalizeRetMBB(MachineBasicBlock &MBB);
+
+  /// Check whether \p MBB1 and \p MBB2 have a common nearest parent branch;
+  /// the first common branch found is marked hasBeenJoined.
+  bool hasCommonNearestParentBranch(MachineBasicBlock &MBB1,
+                                    MachineBasicBlock &MBB2);
+
+  /// Check if two return blocks can join or not
+  bool canJoinMBB(MachineBasicBlock &MBB1, MachineBasicBlock &MBB2);
+
+  /// Check the dominator tree path between \p TargetMBB and \p CurrMBB.
+  /// NOTE: despite its name, this returns true when a branch block that has
+  /// not been joined is found on the path, and false otherwise.
+  bool hasNoUnjoinedBranch(MachineBasicBlock *CurrMBB,
+                           MachineBasicBlock *TargetMBB);
+
+  /// Find all the branch predecessor no matter direct or indirect
+  SmallVector<MachineBasicBlock *>
+  findAllNearestParentBranches(MachineBasicBlock &MBB);
+
+  /// Check MBB ends with a return (PseudoRET) or a tail call (PseudoTAIL)
+  static bool endsWithRetOrTail(const MachineBasicBlock &MBB) {
+    if (MBB.empty())
+      return false;
+    const unsigned Opcode = MBB.instr_back().getOpcode();
+    return Opcode == RISCV::PseudoRET || Opcode == RISCV::PseudoTAIL;
+  }
+
+  /// Check MBB is divergent branch or not
+  bool isDivergentBranchBlock(MachineBasicBlock &MBB) {
+    if (MBB.empty())
+      return false;
+
+    const MachineInstr &MI = MBB.instr_back();
+    switch (MI.getOpcode()) {
+    default:
+      return false;
+    case RISCV::VBEQ:
+    case RISCV::VBNE:
+    case RISCV::VBLT:
+    case RISCV::VBGE:
+    case RISCV::VBLTU:
+    case RISCV::VBGEU:
+      return true;
+    }
+  }
+
+  /// Check MBB is common branch or not
+  bool isCommonBranchBlock(MachineBasicBlock &MBB) {
+    if (MBB.empty())
+      return false;
+
+    const MachineInstr &MI = MBB.instr_back();
+    switch (MI.getOpcode()) {
+    default:
+      return false;
+    case RISCV::BEQ:
+    case RISCV::BNE:
+    case RISCV::BLT:
+    case RISCV::BGE:
+    case RISCV::BLTU:
+    case RISCV::BGEU:
+      return true;
+    }
+  }
+
+  /// Legalize all the return MBB
+  bool canJoinRetMBB(MachineFunction &MF);
+
+  /// Get return MBB numbers
+  unsigned getReturnBlockNum(MachineFunction &MF);
+
+  StringRef getPassName() const override {
+    return VENTUS_INSERT_JOIN_TO_BRANCH;
+  }
+};
+
+char VentusInsertJoinToVBranch::ID = 0;
+
+void VentusInsertJoinToVBranch::collectBranchMBBInfo(MachineFunction &MF) {
+  // The pass object is reused for every function; drop the previous entries
+  // so no pointer to a block of an earlier function is left behind.
+  BranchMBBInfo.clear();
+  for (auto &MBB : MF) {
+    if (isCommonBranchBlock(MBB))
+      BranchMBBInfo[&MBB] = {false, false};
+    else if (isDivergentBranchBlock(MBB))
+      BranchMBBInfo[&MBB] = {true, false};
+  }
+}
+
+unsigned VentusInsertJoinToVBranch::getReturnBlockNum(MachineFunction &MF) {
+  // Clear return block before each analysis
+  ReturnBlock.clear();
+  for (auto &MBB : MF) {
+    if (MBB.isReturnBlock())
+      ReturnBlock.push_back(&MBB);
+  }
+  return ReturnBlock.size();
+}
+
+bool VentusInsertJoinToVBranch::insertJoinMBB(MachineBasicBlock &MBB1,
+                                              MachineBasicBlock &MBB2) {
+  // Both blocks must still end with their own return: a block whose return
+  // was already removed by an earlier join must not be legalized again.
+  if (&MBB1 == &MBB2 || !endsWithRetOrTail(MBB1) || !endsWithRetOrTail(MBB2))
+    return false;
+
+  MachineBasicBlock *PseudoJoinMBB = MachineFunc->CreateMachineBasicBlock();
+  BuildMI(*PseudoJoinMBB, PseudoJoinMBB->end(), DebugLoc(),
+          TII->get(RISCV::PseudoRET));
+  MachineFunc->push_back(PseudoJoinMBB);
+  legalizeRetMBB(MBB1);
+  legalizeRetMBB(MBB2);
+  MBB1.addSuccessor(PseudoJoinMBB);
+  MBB2.addSuccessor(PseudoJoinMBB);
+  return true;
+}
+
+/// Check if two return blocks can join or not
+bool VentusInsertJoinToVBranch::canJoinMBB(MachineBasicBlock &MBB1,
+                                           MachineBasicBlock &MBB2) {
+  // Blocks that are not in the dominator tree have no common dominator
+  if (!MDT->getNode(&MBB1) || !MDT->getNode(&MBB2))
+    return false;
+  MachineBasicBlock *DominatorBlock =
+      MDT->findNearestCommonDominator(&MBB1, &MBB2);
+  if (!DominatorBlock)
+    return false;
+  // hasNoUnjoinedBranch returns true when an unjoined branch IS on the path
+  if (hasNoUnjoinedBranch(DominatorBlock, &MBB1) ||
+      hasNoUnjoinedBranch(DominatorBlock, &MBB2))
+    return false;
+  // The common dominator does not have to be a recorded branch block
+  auto Info = BranchMBBInfo.find(DominatorBlock);
+  if (Info != BranchMBBInfo.end())
+    Info->second.hasBeenJoined = true;
+  return true;
+}
+
+bool VentusInsertJoinToVBranch::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const RISCVInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  MRI = MF.getSubtarget<RISCVSubtarget>().getRegisterInfo();
+  MR = &MF.getRegInfo();
+  MachineFunc = &MF;
+  collectBranchMBBInfo(MF);
+  MDT->getBase().recalculate(MF);
+
+  // After this check, all return blocks are expected to be legal
+  bool IsChanged = canJoinRetMBB(MF);
+  MDT->getBase().recalculate(MF);
+
+  // Join blocks are inserted below, so walk a snapshot of the block list
+  SmallVector<MachineBasicBlock *> Blocks;
+  for (MachineBasicBlock &MBB : MF)
+    Blocks.push_back(&MBB);
+
+  for (MachineBasicBlock *MBB : Blocks) {
+    MachineDomTreeNode *Node = MDT->getNode(MBB);
+    if (!Node || !Node->getIDom())
+      continue;
+    // At least two predecessors
+    if (MBB->pred_size() < 2)
+      continue;
+    // The immediate dominator must be a divergent branch
+    auto IDomInfo = BranchMBBInfo.find(Node->getIDom()->getBlock());
+    if (IDomInfo == BranchMBBInfo.end() || !IDomInfo->second.isDivergentBranch)
+      continue;
+
+    SmallVector<MachineBasicBlock *> Predecessors(MBB->pred_begin(),
+                                                  MBB->pred_end());
+    for (MachineBasicBlock *Predecessor : Predecessors) {
+      if (isDivergentBranchBlock(*Predecessor)) {
+        // Divergent branch, insert a block between MBB & predecessor
+        MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
+        // This is essential to keep CFG legal, if MBB is the fall through
+        // block of predecessor, the NewBB should replace MBB's place
+        // otherwise, we only need to insert before MBB
+        if (Predecessor->getFallThrough() == MBB)
+          MF.insert(std::next(Predecessor->getIterator()), NewBB);
+        else
+          MF.insert(MBB->getIterator(), NewBB);
+        // Update the successor list together with the branch operands that
+        // refer to MBB. When MBB is only reached by falling through, the
+        // branch targets the other successor and must be left alone.
+        Predecessor->ReplaceUsesOfBlockWith(MBB, NewBB);
+        NewBB->addSuccessor(MBB);
+        BuildMI(*NewBB, NewBB->end(), DebugLoc(), TII->get(RISCV::JOIN))
+            .addMBB(MBB);
+        IsChanged = true;
+      } else if (Predecessor->empty() ||
+                 Predecessor->instr_back().getOpcode() != RISCV::JOIN) {
+        // Avoid duplicate JOIN add
+        BuildMI(*Predecessor, Predecessor->end(), DebugLoc(),
+                TII->get(RISCV::JOIN))
+            .addMBB(MBB);
+        IsChanged = true;
+      }
+    }
+  }
+  return IsChanged;
+}
+
+bool VentusInsertJoinToVBranch::canJoinRetMBB(MachineFunction &MF) {
+  // Try to join every pair of collected return blocks once. Blocks whose
+  // return was already removed by a join in the same sweep are skipped.
+  auto JoinReturnBlocks = [this](bool UseDominatorTree) {
+    bool Joined = false;
+    for (size_t i = 0; i < ReturnBlock.size(); i++) {
+      for (size_t j = i + 1; j < ReturnBlock.size(); j++) {
+        MachineBasicBlock &First = *ReturnBlock[i];
+        MachineBasicBlock &Second = *ReturnBlock[j];
+        if (!endsWithRetOrTail(First))
+          break;
+        if (!endsWithRetOrTail(Second))
+          continue;
+        const bool CanJoin = UseDominatorTree
+                                 ? canJoinMBB(First, Second)
+                                 : hasCommonNearestParentBranch(First, Second);
+        if (CanJoin)
+          Joined |= insertJoinMBB(First, Second);
+      }
+    }
+    return Joined;
+  };
+
+  // Check two MBBs' nearest parent branch MBB is the same or not, if it is
+  // the same, we need to join them into a common return block
+  getReturnBlockNum(MF);
+  bool IsChanged = JoinReturnBlocks(/*UseDominatorTree=*/false);
+  // Rebuild dominator tree
+  MDT->getBase().recalculate(MF);
+
+  unsigned RetNum = getReturnBlockNum(MF);
+  while (true) {
+    IsChanged |= JoinReturnBlocks(/*UseDominatorTree=*/true);
+    // After check, rebuild dominator tree
+    MDT->getBase().recalculate(MF);
+    unsigned NewRetNum = getReturnBlockNum(MF);
+    // Avoid dead loop
+    if (NewRetNum == RetNum)
+      break;
+    RetNum = NewRetNum;
+  }
+
+  return IsChanged;
+}
+
+/// Legalize return block, right now, we only consider tail call && ret
+bool VentusInsertJoinToVBranch::legalizeRetMBB(MachineBasicBlock &MBB) {
+  if (MBB.empty())
+    return false;
+  // Get last instruction in this basic block
+  MachineInstr &LastInst = MBB.instr_back();
+  const unsigned LastInstOpcode = LastInst.getOpcode();
+  assert((LastInstOpcode == RISCV::PseudoRET ||
+          LastInstOpcode == RISCV::PseudoTAIL) &&
+         "Unexpected opcode");
+  if (LastInstOpcode == RISCV::PseudoRET) {
+    // The return is dropped; insertJoinMBB puts one in the join block
+    LastInst.eraseFromParent();
+    return true;
+  }
+  if (LastInstOpcode == RISCV::PseudoTAIL) {
+    // The tail call becomes a call; the join block holds the return
+    LastInst.setDesc(TII->get(RISCV::PseudoCALL));
+    return true;
+  }
+  // Never rewrite an instruction that is neither a return nor a tail call
+  return false;
+}
+
+bool VentusInsertJoinToVBranch::hasCommonNearestParentBranch(
+    MachineBasicBlock &MBB1, MachineBasicBlock &MBB2) {
+  auto ParentBranches1 = findAllNearestParentBranches(MBB1);
+  auto ParentBranches2 = findAllNearestParentBranches(MBB2);
+  for (MachineBasicBlock *Branch : ParentBranches1) {
+    if (!is_contained(ParentBranches2, Branch))
+      continue;
+    // Update the common branch's hasBeenJoined flag
+    auto Info = BranchMBBInfo.find(Branch);
+    if (Info != BranchMBBInfo.end())
+      Info->second.hasBeenJoined = true;
+    return true;
+  }
+  return false;
+}
+
+SmallVector<MachineBasicBlock *>
+VentusInsertJoinToVBranch::findAllNearestParentBranches(
+    MachineBasicBlock &MBB) {
+  SmallVector<MachineBasicBlock *> BranchParents;
+  SmallPtrSet<MachineBasicBlock *, 8> Expanded;
+  collectNearestParentBranches(MBB, Expanded, BranchParents);
+  return BranchParents;
+}
+
+bool VentusInsertJoinToVBranch::hasNoUnjoinedBranch(
+    MachineBasicBlock *DominatorMBB, MachineBasicBlock *TargetMBB) {
+  // Walk the dominator tree from TargetMBB towards DominatorMBB. The walk
+  // stops before the node whose immediate dominator is DominatorMBB.
+  MachineDomTreeNode *Node = MDT->getNode(TargetMBB);
+  while (Node && Node->getBlock() != DominatorMBB) {
+    MachineDomTreeNode *IDom = Node->getIDom();
+    // The root has no immediate dominator
+    if (!IDom || IDom->getBlock() == DominatorMBB)
+      break;
+    // If an unjoined branch is found on the path, return true
+    auto Info = BranchMBBInfo.find(Node->getBlock());
+    if (Info != BranchMBBInfo.end() && !Info->second.hasBeenJoined)
+      return true;
+    Node = IDom;
+  }
+  return false;
+}
+
+} // end of anonymous namespace
+
+INITIALIZE_PASS(VentusInsertJoinToVBranch, "Insert-join-to-VBranch",
+                VENTUS_INSERT_JOIN_TO_BRANCH, false, false)
+
+namespace llvm {
+FunctionPass *createVentusInsertJoinToVBranchPass() {
+  return new VentusInsertJoinToVBranch();
+}
+} // end of namespace llvm
diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/vbranch-join.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/vbranch-join.ll
new file mode 100644
index 000000000000..3bbd1bff562c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/vbranch-join.ll
@@ -0,0 +1,395 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mcpu=ventus-gpgpu -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefix=VENTUS %s
+
+; Function Attrs: convergent mustprogress nofree norecurse nounwind willreturn memory(none) vscale_range(1,2048)
+define dso_local i32 @branch(i32 noundef %dim) local_unnamed_addr {
+; VENTUS-LABEL: branch:
+; VENTUS: # %bb.0: # %entry
+; VENTUS-NEXT: addi tp, tp, 16
+; VENTUS-NEXT: .cfi_def_cfa_offset 16
+; VENTUS-NEXT: sw ra, -16(sp) # 4-byte Folded Spill
+; VENTUS-NEXT: .cfi_offset ra, 0
+; VENTUS-NEXT: vmv.v.x v0, zero
+; VENTUS-NEXT: call _Z13get_global_idj
+; VENTUS-NEXT: li a1, 14
+; VENTUS-NEXT: vmv.v.x v1, a1
+; VENTUS-NEXT: li a1, 13
+; VENTUS-NEXT: vblt v0, v1, .LBB0_5
+; VENTUS-NEXT: # %bb.1: # %if.else
+; VENTUS-NEXT: vmv.x.s a0, v0
+; VENTUS-NEXT: li a1, 18
+; VENTUS-NEXT: bgeu a0, a1, 
.LBB0_3 +; VENTUS-NEXT: join v0, v0, .LBB0_2 +; VENTUS-NEXT: .LBB0_5: +; VENTUS-NEXT: join v0, v0, .LBB0_2 +; VENTUS-NEXT: .LBB0_2: # %cleanup +; VENTUS-NEXT: vmv.v.x v0, a1 +; VENTUS-NEXT: lw ra, -16(sp) # 4-byte Folded Reload +; VENTUS-NEXT: addi tp, tp, -16 +; VENTUS-NEXT: join v0, v0, .LBB0_4 +; VENTUS-NEXT: .LBB0_3: # %if.end3 +; VENTUS-NEXT: li a0, 4 +; VENTUS-NEXT: vmv.v.x v0, a0 +; VENTUS-NEXT: lw ra, -16(sp) # 4-byte Folded Reload +; VENTUS-NEXT: addi tp, tp, -16 +; VENTUS-NEXT: call _Z13get_global_idj +; VENTUS-NEXT: join v0, v0, .LBB0_4 +; VENTUS-NEXT: .LBB0_4: +; VENTUS-NEXT: ret +entry: + %call = tail call i32 @_Z13get_global_idj(i32 noundef 0) + %cmp = icmp slt i32 %call, 14 + br i1 %cmp, label %cleanup, label %if.else + +if.else: ; preds = %entry + %cmp1 = icmp ult i32 %call, 18 + br i1 %cmp1, label %cleanup, label %if.end3 + +if.end3: ; preds = %if.else + %call4 = tail call i32 @_Z13get_global_idj(i32 noundef 4) + br label %cleanup + +cleanup: ; preds = %if.else, %entry, %if.end3 + %retval.0 = phi i32 [ %call4, %if.end3 ], [ 13, %entry ], [ 18, %if.else ] + ret i32 %retval.0 +} + +define dso_local spir_kernel void @loop_branch(ptr addrspace(1) nocapture noundef align 4 %A, ptr addrspace(1) nocapture noundef readonly align 4 %B) { +; VENTUS-LABEL: loop_branch: +; VENTUS: # %bb.0: # %entry +; VENTUS-NEXT: addi sp, sp, 16 +; VENTUS-NEXT: .cfi_def_cfa_offset 16 +; VENTUS-NEXT: sw ra, -12(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw s0, -16(sp) # 4-byte Folded Spill +; VENTUS-NEXT: .cfi_offset ra, 4 +; VENTUS-NEXT: .cfi_offset s0, 0 +; VENTUS-NEXT: mv s0, a0 +; VENTUS-NEXT: vmv.v.x v0, zero +; VENTUS-NEXT: call _Z13get_global_idj +; VENTUS-NEXT: vmv.x.s a0, v0 +; VENTUS-NEXT: vmv.v.x v0, zero +; VENTUS-NEXT: vmv.v.x v1, a0 +; VENTUS-NEXT: vbeq v1, v0, .LBB1_4 +; VENTUS-NEXT: # %bb.1: # %for.body.lr.ph +; VENTUS-NEXT: lw a3, 4(s0) +; VENTUS-NEXT: lw a1, 0(s0) +; VENTUS-NEXT: slli a4, a0, 2 +; VENTUS-NEXT: add a1, a1, a4 +; VENTUS-NEXT: lw a2, 0(a1) +; 
VENTUS-NEXT: add a3, a3, a4 +; VENTUS-NEXT: .LBB1_2: # %for.body +; VENTUS-NEXT: # =>This Inner Loop Header: Depth=1 +; VENTUS-NEXT: lw a4, 0(a3) +; VENTUS-NEXT: add a2, a2, a4 +; VENTUS-NEXT: addi a0, a0, -1 +; VENTUS-NEXT: sw a2, 0(a1) +; VENTUS-NEXT: bnez a0, .LBB1_2 +; VENTUS-NEXT: join v0, v0, .LBB1_3 +; VENTUS-NEXT: .LBB1_4: +; VENTUS-NEXT: join v0, v0, .LBB1_3 +; VENTUS-NEXT: .LBB1_3: # %for.cond.cleanup +; VENTUS-NEXT: lw ra, -12(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw s0, -16(sp) # 4-byte Folded Reload +; VENTUS-NEXT: addi sp, sp, -16 +; VENTUS-NEXT: ret +entry: + %call = tail call i32 @_Z13get_global_idj(i32 noundef 0) + %cmp5.not = icmp eq i32 %call, 0 + br i1 %cmp5.not, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %B, i32 %call + %arrayidx1 = getelementptr inbounds i32, ptr addrspace(1) %A, i32 %call + %.pre = load i32, ptr addrspace(1) %arrayidx1, align 4 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %0 = phi i32 [ %.pre, %for.body.lr.ph ], [ %add, %for.body ] + %i.06 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %1 = load i32, ptr addrspace(1) %arrayidx, align 4 + %add = add nsw i32 %0, %1 + store i32 %add, ptr addrspace(1) %arrayidx1, align 4 + %inc = add nuw nsw i32 %i.06, 1 + %exitcond.not = icmp eq i32 %inc, %call + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local i32 @branch_in_branch(i32 noundef %dim) local_unnamed_addr { +; VENTUS-LABEL: branch_in_branch: +; VENTUS: # %bb.0: # %entry +; VENTUS-NEXT: addi tp, tp, 16 +; VENTUS-NEXT: .cfi_def_cfa_offset 16 +; VENTUS-NEXT: sw ra, -12(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw s0, -16(sp) # 4-byte Folded Spill +; VENTUS-NEXT: .cfi_offset ra, 4 +; VENTUS-NEXT: .cfi_offset s0, 0 +; VENTUS-NEXT: vmv.v.x v0, zero +; VENTUS-NEXT: call _Z13get_global_idj +; VENTUS-NEXT: 
li a0, 14 +; VENTUS-NEXT: vmv.v.x v1, a0 +; VENTUS-NEXT: li a0, 13 +; VENTUS-NEXT: vblt v0, v1, .LBB2_9 +; VENTUS-NEXT: # %bb.1: # %if.else +; VENTUS-NEXT: vmv.x.s s0, v0 +; VENTUS-NEXT: li a0, 17 +; VENTUS-NEXT: bltu a0, s0, .LBB2_4 +; VENTUS-NEXT: # %bb.2: # %if.then2 +; VENTUS-NEXT: li a0, 1 +; VENTUS-NEXT: vmv.v.x v0, a0 +; VENTUS-NEXT: call _Z13get_global_idj +; VENTUS-NEXT: vmv.v.x v1, s0 +; VENTUS-NEXT: vblt v0, v1, .LBB2_5 +; VENTUS-NEXT: # %bb.3: # %if.then2 +; VENTUS-NEXT: li a0, 23 +; VENTUS-NEXT: vmv.v.x v0, a0 +; VENTUS-NEXT: join v0, v0, .LBB2_6 +; VENTUS-NEXT: .LBB2_4: # %if.end7 +; VENTUS-NEXT: li a0, 4 +; VENTUS-NEXT: vmv.v.x v0, a0 +; VENTUS-NEXT: lw ra, -12(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw s0, -16(sp) # 4-byte Folded Reload +; VENTUS-NEXT: addi tp, tp, -16 +; VENTUS-NEXT: call _Z13get_global_idj +; VENTUS-NEXT: join v0, v0, .LBB2_8 +; VENTUS-NEXT: .LBB2_5: # %if.then2 +; VENTUS-NEXT: li a0, 12 +; VENTUS-NEXT: vmv.v.x v0, a0 +; VENTUS-NEXT: join v0, v0, .LBB2_6 +; VENTUS-NEXT: .LBB2_6: # %if.then2 +; VENTUS-NEXT: vmv.x.s a0, v0 +; VENTUS-NEXT: join v0, v0, .LBB2_7 +; VENTUS-NEXT: .LBB2_9: +; VENTUS-NEXT: join v0, v0, .LBB2_7 +; VENTUS-NEXT: .LBB2_7: # %cleanup9 +; VENTUS-NEXT: vmv.v.x v0, a0 +; VENTUS-NEXT: lw ra, -12(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw s0, -16(sp) # 4-byte Folded Reload +; VENTUS-NEXT: addi tp, tp, -16 +; VENTUS-NEXT: join v0, v0, .LBB2_8 +; VENTUS-NEXT: .LBB2_8: +; VENTUS-NEXT: ret +entry: + %call = tail call i32 @_Z13get_global_idj(i32 noundef 0) + %cmp = icmp slt i32 %call, 14 + br i1 %cmp, label %cleanup9, label %if.else + +if.else: ; preds = %entry + %cmp1 = icmp ult i32 %call, 18 + br i1 %cmp1, label %if.then2, label %if.end7 + +if.then2: ; preds = %if.else + %call3 = tail call i32 @_Z13get_global_idj(i32 noundef 1) + %cmp4 = icmp sgt i32 %call, %call3 + %. 
= select i1 %cmp4, i32 12, i32 23 + br label %cleanup9 + +if.end7: ; preds = %if.else + %call8 = tail call i32 @_Z13get_global_idj(i32 noundef 4) + br label %cleanup9 + +cleanup9: ; preds = %entry, %if.end7, %if.then2 + %retval.1 = phi i32 [ %., %if.then2 ], [ %call8, %if.end7 ], [ 13, %entry ] + ret i32 %retval.1 +} + +; Function Attrs: convergent nofree norecurse nounwind memory(argmem: readwrite) vscale_range(1,2048) +define dso_local spir_kernel void @double_loop(ptr addrspace(1) nocapture noundef align 4 %A, ptr addrspace(1) nocapture noundef readonly align 4 %B) { +; VENTUS-LABEL: double_loop: +; VENTUS: # %bb.0: # %entry +; VENTUS-NEXT: addi sp, sp, 16 +; VENTUS-NEXT: .cfi_def_cfa_offset 16 +; VENTUS-NEXT: sw ra, -12(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw s0, -16(sp) # 4-byte Folded Spill +; VENTUS-NEXT: .cfi_offset ra, 4 +; VENTUS-NEXT: .cfi_offset s0, 0 +; VENTUS-NEXT: mv s0, a0 +; VENTUS-NEXT: vmv.v.x v0, zero +; VENTUS-NEXT: call _Z13get_global_idj +; VENTUS-NEXT: vmv.x.s a0, v0 +; VENTUS-NEXT: vmv.v.x v0, zero +; VENTUS-NEXT: vmv.v.x v1, a0 +; VENTUS-NEXT: vbeq v1, v0, .LBB3_6 +; VENTUS-NEXT: # %bb.1: # %for.cond1.preheader.lr.ph +; VENTUS-NEXT: li a1, 0 +; VENTUS-NEXT: lw a4, 4(s0) +; VENTUS-NEXT: lw a2, 0(s0) +; VENTUS-NEXT: slli a5, a0, 2 +; VENTUS-NEXT: add a2, a2, a5 +; VENTUS-NEXT: lw a3, 0(a2) +; VENTUS-NEXT: add a4, a4, a5 +; VENTUS-NEXT: .LBB3_2: # %for.cond1.preheader +; VENTUS-NEXT: # =>This Loop Header: Depth=1 +; VENTUS-NEXT: # Child Loop BB3_3 Depth 2 +; VENTUS-NEXT: mv a5, a0 +; VENTUS-NEXT: .LBB3_3: # %for.body4 +; VENTUS-NEXT: # Parent Loop BB3_2 Depth=1 +; VENTUS-NEXT: # => This Inner Loop Header: Depth=2 +; VENTUS-NEXT: lw a6, 0(a4) +; VENTUS-NEXT: add a3, a3, a6 +; VENTUS-NEXT: addi a5, a5, -1 +; VENTUS-NEXT: sw a3, 0(a2) +; VENTUS-NEXT: bnez a5, .LBB3_3 +; VENTUS-NEXT: # %bb.4: # %for.cond1.for.cond.cleanup3_crit_edge +; VENTUS-NEXT: # in Loop: Header=BB3_2 Depth=1 +; VENTUS-NEXT: addi a1, a1, 1 +; VENTUS-NEXT: bne a1, a0, 
.LBB3_2 +; VENTUS-NEXT: join v0, v0, .LBB3_5 +; VENTUS-NEXT: .LBB3_6: +; VENTUS-NEXT: join v0, v0, .LBB3_5 +; VENTUS-NEXT: .LBB3_5: # %for.cond.cleanup +; VENTUS-NEXT: lw ra, -12(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw s0, -16(sp) # 4-byte Folded Reload +; VENTUS-NEXT: addi sp, sp, -16 +; VENTUS-NEXT: ret +entry: + %call = tail call i32 @_Z13get_global_idj(i32 noundef 0) + %cmp16.not = icmp eq i32 %call, 0 + br i1 %cmp16.not, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph + +for.cond1.preheader.lr.ph: ; preds = %entry + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %B, i32 %call + %arrayidx5 = getelementptr inbounds i32, ptr addrspace(1) %A, i32 %call + %.pre.pre = load i32, ptr addrspace(1) %arrayidx5, align 4 + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.cond1.for.cond.cleanup3_crit_edge + %.pre = phi i32 [ %.pre.pre, %for.cond1.preheader.lr.ph ], [ %add, %for.cond1.for.cond.cleanup3_crit_edge ] + %i.017 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc7, %for.cond1.for.cond.cleanup3_crit_edge ] + br label %for.body4 + +for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge, %entry + ret void + +for.cond1.for.cond.cleanup3_crit_edge: ; preds = %for.body4 + %inc7 = add nuw nsw i32 %i.017, 1 + %exitcond18.not = icmp eq i32 %inc7, %call + br i1 %exitcond18.not, label %for.cond.cleanup, label %for.cond1.preheader + +for.body4: ; preds = %for.cond1.preheader, %for.body4 + %0 = phi i32 [ %.pre, %for.cond1.preheader ], [ %add, %for.body4 ] + %j.015 = phi i32 [ 0, %for.cond1.preheader ], [ %inc, %for.body4 ] + %1 = load i32, ptr addrspace(1) %arrayidx, align 4 + %add = add nsw i32 %0, %1 + store i32 %add, ptr addrspace(1) %arrayidx5, align 4 + %inc = add nuw nsw i32 %j.015, 1 + %exitcond.not = icmp eq i32 %inc, %call + br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge, label %for.body4 +} + +; Function Attrs: convergent nofree norecurse nounwind memory(argmem: 
readwrite) vscale_range(1,2048) +define dso_local spir_kernel void @loop_switch(ptr addrspace(1) nocapture noundef align 4 %A, ptr addrspace(1) nocapture noundef readonly align 4 %B) { +; VENTUS-LABEL: loop_switch: +; VENTUS: # %bb.0: # %entry +; VENTUS-NEXT: addi sp, sp, 16 +; VENTUS-NEXT: .cfi_def_cfa_offset 16 +; VENTUS-NEXT: sw ra, -12(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw s0, -16(sp) # 4-byte Folded Spill +; VENTUS-NEXT: .cfi_offset ra, 4 +; VENTUS-NEXT: .cfi_offset s0, 0 +; VENTUS-NEXT: mv s0, a0 +; VENTUS-NEXT: vmv.v.x v0, zero +; VENTUS-NEXT: call _Z13get_global_idj +; VENTUS-NEXT: vmv.x.s a0, v0 +; VENTUS-NEXT: vmv.v.x v0, zero +; VENTUS-NEXT: vmv.v.x v1, a0 +; VENTUS-NEXT: vbeq v1, v0, .LBB4_10 +; VENTUS-NEXT: # %bb.1: # %for.body.lr.ph +; VENTUS-NEXT: li a1, 0 +; VENTUS-NEXT: lw a2, 4(s0) +; VENTUS-NEXT: lw a5, 0(s0) +; VENTUS-NEXT: slli a3, a0, 2 +; VENTUS-NEXT: add a2, a2, a3 +; VENTUS-NEXT: add a3, a5, a3 +; VENTUS-NEXT: addi a4, a5, 8 +; VENTUS-NEXT: addi a5, a5, 4 +; VENTUS-NEXT: li a6, 1 +; VENTUS-NEXT: li a7, 2 +; VENTUS-NEXT: j .LBB4_5 +; VENTUS-NEXT: .LBB4_2: # %sw.default +; VENTUS-NEXT: # in Loop: Header=BB4_5 Depth=1 +; VENTUS-NEXT: lw t1, 0(a2) +; VENTUS-NEXT: mv t0, a3 +; VENTUS-NEXT: .LBB4_3: # %for.inc.sink.split +; VENTUS-NEXT: # in Loop: Header=BB4_5 Depth=1 +; VENTUS-NEXT: lw t2, 0(t0) +; VENTUS-NEXT: add t1, t2, t1 +; VENTUS-NEXT: sw t1, 0(t0) +; VENTUS-NEXT: .LBB4_4: # %for.inc +; VENTUS-NEXT: # in Loop: Header=BB4_5 Depth=1 +; VENTUS-NEXT: addi a1, a1, 1 +; VENTUS-NEXT: beq a0, a1, .LBB4_9 +; VENTUS-NEXT: join v0, v0, .LBB4_9 +; VENTUS-NEXT: .LBB4_5: # %for.body +; VENTUS-NEXT: # =>This Inner Loop Header: Depth=1 +; VENTUS-NEXT: beqz a1, .LBB4_4 +; VENTUS-NEXT: # %bb.6: # %for.body +; VENTUS-NEXT: # in Loop: Header=BB4_5 Depth=1 +; VENTUS-NEXT: mv t0, a5 +; VENTUS-NEXT: li t1, 2 +; VENTUS-NEXT: beq a1, a6, .LBB4_3 +; VENTUS-NEXT: # %bb.7: # %for.body +; VENTUS-NEXT: # in Loop: Header=BB4_5 Depth=1 +; VENTUS-NEXT: bne a1, a7, 
.LBB4_2 +; VENTUS-NEXT: # %bb.8: # %sw.bb4 +; VENTUS-NEXT: # in Loop: Header=BB4_5 Depth=1 +; VENTUS-NEXT: li t1, 23 +; VENTUS-NEXT: mv t0, a4 +; VENTUS-NEXT: j .LBB4_3 +; VENTUS-NEXT: .LBB4_10: +; VENTUS-NEXT: join v0, v0, .LBB4_9 +; VENTUS-NEXT: .LBB4_9: # %for.cond.cleanup +; VENTUS-NEXT: lw ra, -12(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw s0, -16(sp) # 4-byte Folded Reload +; VENTUS-NEXT: addi sp, sp, -16 +; VENTUS-NEXT: ret +entry: + %call = tail call i32 @_Z13get_global_idj(i32 noundef 0) + %cmp21.not = icmp eq i32 %call, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %arrayidx7 = getelementptr inbounds i32, ptr addrspace(1) %B, i32 %call + %arrayidx8 = getelementptr inbounds i32, ptr addrspace(1) %A, i32 %call + %arrayidx5 = getelementptr inbounds i32, ptr addrspace(1) %A, i32 2 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %A, i32 1 + br label %for.body + +for.cond.cleanup: ; preds = %for.inc, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.inc + %i.022 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] + switch i32 %i.022, label %sw.default [ + i32 0, label %for.inc + i32 1, label %for.inc.sink.split + i32 2, label %sw.bb4 + ] + +sw.bb4: ; preds = %for.body + br label %for.inc.sink.split + +sw.default: ; preds = %for.body + %0 = load i32, ptr addrspace(1) %arrayidx7, align 4 + br label %for.inc.sink.split + +for.inc.sink.split: ; preds = %for.body, %sw.default, %sw.bb4 + %arrayidx2.sink24 = phi ptr addrspace(1) [ %arrayidx5, %sw.bb4 ], [ %arrayidx8, %sw.default ], [ %arrayidx2, %for.body ] + %.sink23 = phi i32 [ 23, %sw.bb4 ], [ %0, %sw.default ], [ 2, %for.body ] + %1 = load i32, ptr addrspace(1) %arrayidx2.sink24, align 4 + %add3 = add nsw i32 %1, %.sink23 + store i32 %add3, ptr addrspace(1) %arrayidx2.sink24, align 4 + br label %for.inc + +for.inc: ; preds = %for.inc.sink.split, %for.body + %inc = add nuw nsw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, 
%call + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none) +declare dso_local i32 @_Z13get_global_idj(i32 noundef) local_unnamed_addr +