forked from OSchip/llvm-project

[AMDGPU] Combine s_or_saveexec, s_xor instructions.

This patch merges a consecutive sequence of

  s_or_saveexec s_o, s_i
  s_xor exec, exec, s_o

into a single

  s_andn2_saveexec s_o, s_i

instruction. This patch also cleans up the SIOptimizeExecMasking pass a bit.

Reviewed By: nhaehnle

Differential Revision: https://reviews.llvm.org/D129073

parent 65c8e24622
commit fd64a857ee
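For illustration, here is a sketch of the fold in its wave64 form (the concrete registers are arbitrary, borrowed from the tests below). The rewrite is sound because, with e denoting the exec mask before the sequence, (s_i | e) ^ e == s_i & ~e:

  ; before
  s_or_saveexec_b64 s[0:1], s[4:5]    ; s[0:1] = e, exec = s[4:5] | e
  s_xor_b64 exec, exec, s[0:1]        ; exec = (s[4:5] | e) ^ e = s[4:5] & ~e

  ; after
  s_andn2_saveexec_b64 s[0:1], s[4:5] ; s[0:1] = e, exec = s[4:5] & ~e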
@@ -12,6 +12,8 @@
 #include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/InitializePasses.h"
 
 using namespace llvm;
@@ -26,6 +28,10 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
   const SIRegisterInfo *TRI = nullptr;
   const SIInstrInfo *TII = nullptr;
   const MachineRegisterInfo *MRI = nullptr;
+  MCRegister Exec;
 
+  DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
+  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;
+
   Register isCopyFromExec(const MachineInstr &MI) const;
   Register isCopyToExec(const MachineInstr &MI) const;
@@ -44,13 +50,13 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
                                   std::function<bool(MachineInstr *)> Pred,
                                   ArrayRef<MCRegister> NonModifiableRegs,
                                   unsigned MaxInstructions = 20) const;
-  MachineInstr *findPossibleVCMPVCMPXOptimization(MachineInstr &SaveExec,
-                                                  MCRegister Exec) const;
-  bool optimizeExecSequence() const;
-  bool optimizeVCmpxAndSaveexecSequence() const;
-  bool optimizeSingleVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
-                                          MachineInstr &VCmp,
-                                          MCRegister Exec) const;
+  bool optimizeExecSequence();
+  void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
+  bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
+                                    MachineInstr &VCmp, MCRegister Exec) const;
+
+  void tryRecordOrSaveexecXorSequence(MachineInstr &MI);
+  bool optimizeOrSaveexecXorSequences();
 
 public:
   static char ID;
@@ -92,7 +98,7 @@ Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
   case AMDGPU::S_MOV_B32:
   case AMDGPU::S_MOV_B32_term: {
     const MachineOperand &Src = MI.getOperand(1);
-    if (Src.isReg() && Src.getReg() == TRI->getExec())
+    if (Src.isReg() && Src.getReg() == Exec)
       return MI.getOperand(0).getReg();
   }
   }
@@ -107,8 +113,7 @@ Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
   case AMDGPU::S_MOV_B64:
   case AMDGPU::S_MOV_B32: {
     const MachineOperand &Dst = MI.getOperand(0);
-    if (Dst.isReg() && Dst.getReg() == TRI->getExec() &&
-        MI.getOperand(1).isReg())
+    if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
       return MI.getOperand(1).getReg();
     break;
   }
@@ -394,9 +399,7 @@ bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
 // =>
 // x = s_<op>_saveexec_b64 y
 //
-bool SIOptimizeExecMasking::optimizeExecSequence() const {
-  MCRegister Exec = TRI->getExec();
-
+bool SIOptimizeExecMasking::optimizeExecSequence() {
   bool Changed = false;
   for (MachineBasicBlock &MBB : *MF) {
     MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
@@ -551,88 +554,9 @@ bool SIOptimizeExecMasking::optimizeExecSequence() const {
   return Changed;
 }
 
-// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence
-// by looking at an instance of a s_and_saveexec instruction. Returns a pointer
-// to the v_cmp instruction if it is safe to replace the sequence (see the
-// conditions in the function body). This is after register allocation, so some
-// checks on operand dependencies need to be considered.
-MachineInstr *SIOptimizeExecMasking::findPossibleVCMPVCMPXOptimization(
-    MachineInstr &SaveExec, MCRegister Exec) const {
-
-  MachineInstr *VCmp = nullptr;
-
-  Register SaveExecDest = SaveExec.getOperand(0).getReg();
-  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
-    return nullptr;
-
-  MachineOperand *SaveExecSrc0 =
-      TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
-  if (!SaveExecSrc0->isReg())
-    return nullptr;
-
-  // Try to find the last v_cmp instruction that defs the saveexec input
-  // operand without any write to Exec or the saveexec input operand inbetween.
-  VCmp = findInstrBackwards(
-      SaveExec,
-      [&](MachineInstr *Check) {
-        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
-               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
-      },
-      {Exec, SaveExecSrc0->getReg()});
-
-  if (!VCmp)
-    return nullptr;
-
-  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
-  assert(VCmpDest && "Should have an sdst operand!");
-
-  // Check if any of the v_cmp source operands is written by the saveexec.
-  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
-  if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
-      SaveExec.modifiesRegister(Src0->getReg(), TRI))
-    return nullptr;
-
-  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
-  if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
-      SaveExec.modifiesRegister(Src1->getReg(), TRI))
-    return nullptr;
-
-  // Don't do the transformation if the destination operand is included in
-  // it's MBB Live-outs, meaning it's used in any of it's successors, leading
-  // to incorrect code if the v_cmp and therefore the def of
-  // the dest operand is removed.
-  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
-    return nullptr;
-
-  // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
-  // s_and_saveexec, skip the optimization.
-  if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), false,
-                             true) ||
-      isRegisterInUseAfter(SaveExec, VCmpDest->getReg()))
-    return nullptr;
-
-  // Try to determine if there is a write to any of the VCmp
-  // operands between the saveexec and the vcmp.
-  // If yes, additional VGPR spilling might need to be inserted. In this case,
-  // it's not worth replacing the instruction sequence.
-  SmallVector<MCRegister, 2> NonDefRegs;
-  if (Src0->isReg())
-    NonDefRegs.push_back(Src0->getReg());
-
-  if (Src1->isReg())
-    NonDefRegs.push_back(Src1->getReg());
-
-  if (!findInstrBackwards(
-          SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
-          NonDefRegs))
-    return nullptr;
-
-  return VCmp;
-}
-
 // Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
 // operands extracted from a v_cmp ..., s_and_saveexec pattern.
-bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence(
+bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
     MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
   const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
 
@@ -678,50 +602,164 @@ bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence(
   if (Src1->isReg())
     MRI->clearKillFlags(Src1->getReg());
 
+  SaveExecInstr.eraseFromParent();
+  VCmp.eraseFromParent();
+
   return true;
 }
 
-// After all s_op_saveexec instructions are inserted,
-// replace (on GFX10.3 and later)
+// Record (on GFX10.3 and later) occurences of
 // v_cmp_* SGPR, IMM, VGPR
 // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
-// with
+// to be replaced with
 // s_mov_b32 EXEC_SGPR_DEST, exec_lo
 // v_cmpx_* IMM, VGPR
 // to reduce pipeline stalls.
-bool SIOptimizeExecMasking::optimizeVCmpxAndSaveexecSequence() const {
+void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
+    MachineInstr &MI) {
   if (!ST->hasGFX10_3Insts())
-    return false;
+    return;
 
-  bool Changed = false;
-
-  DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
-  MCRegister Exec = TRI->getExec();
   const unsigned AndSaveExecOpcode =
       ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
 
-  for (MachineBasicBlock &MBB : *MF) {
-    for (MachineInstr &MI : MBB) {
-      // Record relevant v_cmp / s_and_saveexec instruction pairs for
-      // replacement.
-      if (MI.getOpcode() != AndSaveExecOpcode)
-        continue;
+  if (MI.getOpcode() != AndSaveExecOpcode)
+    return;
 
-      if (MachineInstr *VCmp = findPossibleVCMPVCMPXOptimization(MI, Exec))
-        SaveExecVCmpMapping[&MI] = VCmp;
-    }
-  }
+  Register SaveExecDest = MI.getOperand(0).getReg();
+  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
+    return;
+
+  MachineOperand *SaveExecSrc0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  if (!SaveExecSrc0->isReg())
+    return;
+
+  // Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec
+  // sequence by looking at an instance of a s_and_saveexec instruction. Returns
+  // a pointer to the v_cmp instruction if it is safe to replace the sequence
+  // (see the conditions in the function body). This is after register
+  // allocation, so some checks on operand dependencies need to be considered.
+  MachineInstr *VCmp = nullptr;
+
+  // Try to find the last v_cmp instruction that defs the saveexec input
+  // operand without any write to Exec or the saveexec input operand inbetween.
+  VCmp = findInstrBackwards(
+      MI,
+      [&](MachineInstr *Check) {
+        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
+               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
+      },
+      {Exec, SaveExecSrc0->getReg()});
+
+  if (!VCmp)
+    return;
+
+  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
+  assert(VCmpDest && "Should have an sdst operand!");
+
+  // Check if any of the v_cmp source operands is written by the saveexec.
+  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
+  if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
+      MI.modifiesRegister(Src0->getReg(), TRI))
+    return;
+
+  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
+  if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
+      MI.modifiesRegister(Src1->getReg(), TRI))
+    return;
+
+  // Don't do the transformation if the destination operand is included in
+  // it's MBB Live-outs, meaning it's used in any of it's successors, leading
+  // to incorrect code if the v_cmp and therefore the def of
+  // the dest operand is removed.
+  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
+    return;
+
+  // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
+  // s_and_saveexec, skip the optimization.
+  if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) ||
+      isRegisterInUseAfter(MI, VCmpDest->getReg()))
+    return;
+
+  // Try to determine if there is a write to any of the VCmp
+  // operands between the saveexec and the vcmp.
+  // If yes, additional VGPR spilling might need to be inserted. In this case,
+  // it's not worth replacing the instruction sequence.
+  SmallVector<MCRegister, 2> NonDefRegs;
+  if (Src0->isReg())
+    NonDefRegs.push_back(Src0->getReg());
+
+  if (Src1->isReg())
+    NonDefRegs.push_back(Src1->getReg());
+
+  if (!findInstrBackwards(
+          MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs))
+    return;
+
+  if (VCmp)
+    SaveExecVCmpMapping[&MI] = VCmp;
+}
+
+// Record occurences of
+// s_or_saveexec s_o, s_i
+// s_xor exec, exec, s_o
+// to be replaced with
+// s_andn2_saveexec s_o, s_i.
+void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) {
+  const unsigned XorOpcode =
+      ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;
+
+  if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) {
+    const MachineOperand &XorDst = MI.getOperand(0);
+    const MachineOperand &XorSrc0 = MI.getOperand(1);
+    const MachineOperand &XorSrc1 = MI.getOperand(2);
+
+    if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() &&
+        XorSrc1.isReg() &&
+        (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) {
+      const unsigned OrSaveexecOpcode = ST->isWave32()
+                                            ? AMDGPU::S_OR_SAVEEXEC_B32
+                                            : AMDGPU::S_OR_SAVEEXEC_B64;
+
+      // Peek at the previous instruction and check if this is a relevant
+      // s_or_saveexec instruction.
+      MachineInstr &PossibleOrSaveexec = *MI.getPrevNode();
+      if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode)
+        return;
+
+      const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0);
+      const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1);
+      if (OrDst.isReg() && OrSrc0.isReg()) {
+        if ((XorSrc0.getReg() == Exec && XorSrc1.getReg() == OrDst.getReg()) ||
+            (XorSrc0.getReg() == OrDst.getReg() && XorSrc1.getReg() == Exec)) {
+          OrXors.emplace_back(&PossibleOrSaveexec, &MI);
+        }
+      }
+    }
+  }
+}
 
-  for (const auto &Entry : SaveExecVCmpMapping) {
-    MachineInstr *SaveExecInstr = Entry.getFirst();
-    MachineInstr *VCmpInstr = Entry.getSecond();
+bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
+  if (OrXors.empty()) {
+    return false;
+  }
 
-    if (optimizeSingleVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec)) {
-      SaveExecInstr->eraseFromParent();
-      VCmpInstr->eraseFromParent();
+  bool Changed = false;
+  const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32
+                                              : AMDGPU::S_ANDN2_SAVEEXEC_B64;
 
-      Changed = true;
-    }
-  }
+  for (const auto &Pair : OrXors) {
+    MachineInstr *Or = nullptr;
+    MachineInstr *Xor = nullptr;
+    std::tie(Or, Xor) = Pair;
+    BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(),
+            TII->get(Andn2Opcode), Or->getOperand(0).getReg())
+        .addReg(Or->getOperand(1).getReg());
+
+    Or->eraseFromParent();
+    Xor->eraseFromParent();
+
+    Changed = true;
+  }
 
   return Changed;
@@ -736,9 +774,42 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
   TRI = ST->getRegisterInfo();
   TII = ST->getInstrInfo();
   MRI = &MF.getRegInfo();
+  Exec = TRI->getExec();
 
   bool Changed = optimizeExecSequence();
-  Changed |= optimizeVCmpxAndSaveexecSequence();
 
+  OrXors.clear();
+  SaveExecVCmpMapping.clear();
+  static unsigned SearchWindow = 10;
+  for (MachineBasicBlock &MBB : MF) {
+    unsigned SearchCount = 0;
+
+    for (auto &MI : llvm::reverse(MBB)) {
+      if (MI.isDebugInstr())
+        continue;
+
+      if (SearchCount >= SearchWindow) {
+        break;
+      }
+
+      tryRecordOrSaveexecXorSequence(MI);
+      tryRecordVCmpxAndSaveexecSequence(MI);
+
+      if (MI.modifiesRegister(Exec, TRI)) {
+        break;
+      }
+
+      ++SearchCount;
+    }
+  }
+
+  Changed |= optimizeOrSaveexecXorSequences();
+  for (const auto &Entry : SaveExecVCmpMapping) {
+    MachineInstr *SaveExecInstr = Entry.getFirst();
+    MachineInstr *VCmpInstr = Entry.getSecond();
+
+    Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec);
+  }
+
   return Changed;
 }
@@ -32,8 +32,7 @@ define amdgpu_cs void @memmove_p1i8(i8 addrspace(1)* %dst, i8 addrspace(1)* %src
 ; LOOP-NEXT:    buffer_store_byte v8, v[6:7], s[0:3], 0 addr64
 ; LOOP-NEXT:    s_cbranch_vccnz .LBB0_2
 ; LOOP-NEXT:  .LBB0_3: ; %Flow14
-; LOOP-NEXT:    s_or_saveexec_b64 s[0:1], s[4:5]
-; LOOP-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; LOOP-NEXT:    s_andn2_saveexec_b64 s[0:1], s[4:5]
 ; LOOP-NEXT:    s_cbranch_execz .LBB0_6
 ; LOOP-NEXT:  ; %bb.4: ; %copy_backwards
 ; LOOP-NEXT:    s_mov_b64 s[4:5], 3
@@ -173,8 +173,14 @@ define void @localize_internal_globals(i1 %cond) {
 ; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
-; GFX9-NEXT:    s_cbranch_execz .LBB2_2
-; GFX9-NEXT:  ; %bb.1: ; %bb1
+; GFX9-NEXT:    s_cbranch_execnz .LBB2_3
+; GFX9-NEXT:  ; %bb.1: ; %Flow
+; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB2_4
+; GFX9-NEXT:  .LBB2_2: ; %bb2
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-NEXT:  .LBB2_3: ; %bb1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, static.gv2@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, static.gv2@rel32@hi+12

@@ -187,11 +193,9 @@ define void @localize_internal_globals(i1 %cond) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:  .LBB2_2: ; %Flow
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB2_4
-; GFX9-NEXT:  ; %bb.3: ; %bb0
+; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT:    s_cbranch_execz .LBB2_2
+; GFX9-NEXT:  .LBB2_4: ; %bb0
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, static.gv0@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, static.gv0@rel32@hi+12

@@ -204,9 +208,7 @@ define void @localize_internal_globals(i1 %cond) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:  .LBB2_4: ; %bb2
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 entry:
   br i1 %cond, label %bb0, label %bb1
@@ -16,8 +16,14 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_execz .LBB0_2
-; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_3
+; CHECK-NEXT:  ; %bb.1: ; %Flow
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_4
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; CHECK-NEXT:  .LBB0_3:
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v2, v0
 ; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v3, v0, vcc

@@ -151,11 +157,9 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; CHECK-NEXT:    ; implicit-def: $vgpr2
 ; CHECK-NEXT:    ; implicit-def: $vgpr4
-; CHECK-NEXT:  .LBB0_2: ; %Flow
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
-; CHECK-NEXT:    s_xor_b64 exec, exec, s[6:7]
-; CHECK-NEXT:    s_cbranch_execz .LBB0_4
-; CHECK-NEXT:  ; %bb.3:
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; CHECK-NEXT:    s_cbranch_execz .LBB0_2
+; CHECK-NEXT:  .LBB0_4:
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, 0, v2
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0

@@ -176,7 +180,6 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:  .LBB0_4:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 %num, %den

@@ -787,8 +790,7 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    ; implicit-def: $vgpr4
 ; CGP-NEXT:    ; implicit-def: $vgpr10
 ; CGP-NEXT:  .LBB2_2: ; %Flow1
-; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
-; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; CGP-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_cbranch_execz .LBB2_4
 ; CGP-NEXT:  ; %bb.3:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v4

@@ -819,8 +821,14 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
-; CGP-NEXT:    s_cbranch_execz .LBB2_6
-; CGP-NEXT:  ; %bb.5:
+; CGP-NEXT:    s_cbranch_execnz .LBB2_7
+; CGP-NEXT:  ; %bb.5: ; %Flow
+; CGP-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; CGP-NEXT:    s_cbranch_execnz .LBB2_8
+; CGP-NEXT:  .LBB2_6:
+; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CGP-NEXT:    s_setpc_b64 s[30:31]
+; CGP-NEXT:  .LBB2_7:
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v6, v2
 ; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v7, v2, vcc

@@ -954,11 +962,9 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr6
 ; CGP-NEXT:    ; implicit-def: $vgpr8
-; CGP-NEXT:  .LBB2_6: ; %Flow
-; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
-; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
-; CGP-NEXT:    s_cbranch_execz .LBB2_8
-; CGP-NEXT:  ; %bb.7:
+; CGP-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; CGP-NEXT:    s_cbranch_execz .LBB2_6
+; CGP-NEXT:  .LBB2_8:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v6
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v6
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2

@@ -979,7 +985,6 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v6
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
-; CGP-NEXT:  .LBB2_8:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv <2 x i64> %num, %den

@@ -2328,8 +2333,14 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_execz .LBB7_2
-; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    s_cbranch_execnz .LBB7_3
+; CHECK-NEXT:  ; %bb.1: ; %Flow
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; CHECK-NEXT:    s_cbranch_execnz .LBB7_4
+; CHECK-NEXT:  .LBB7_2:
+; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; CHECK-NEXT:  .LBB7_3:
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v6
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v5, v0
 ; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v6, v0, vcc

@@ -2463,11 +2474,9 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; CHECK-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; CHECK-NEXT:    ; implicit-def: $vgpr3
-; CHECK-NEXT:  .LBB7_2: ; %Flow
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
-; CHECK-NEXT:    s_xor_b64 exec, exec, s[6:7]
-; CHECK-NEXT:    s_cbranch_execz .LBB7_4
-; CHECK-NEXT:  ; %bb.3:
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; CHECK-NEXT:    s_cbranch_execz .LBB7_2
+; CHECK-NEXT:  .LBB7_4:
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v5
 ; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, 0, v5
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0

@@ -2488,7 +2497,6 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:  .LBB7_4:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl i64 4096, %y

@@ -2953,8 +2961,14 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
-; CGP-NEXT:    s_cbranch_execz .LBB8_6
-; CGP-NEXT:  ; %bb.5:
+; CGP-NEXT:    s_cbranch_execnz .LBB8_7
+; CGP-NEXT:  ; %bb.5: ; %Flow
+; CGP-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; CGP-NEXT:    s_cbranch_execnz .LBB8_8
+; CGP-NEXT:  .LBB8_6:
+; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CGP-NEXT:    s_setpc_b64 s[30:31]
+; CGP-NEXT:  .LBB8_7:
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v10
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v9, v2
 ; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v10, v2, vcc

@@ -3088,11 +3102,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; CGP-NEXT:    ; implicit-def: $vgpr5
-; CGP-NEXT:  .LBB8_6: ; %Flow
-; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
-; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
-; CGP-NEXT:    s_cbranch_execz .LBB8_8
-; CGP-NEXT:  ; %bb.7:
+; CGP-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; CGP-NEXT:    s_cbranch_execz .LBB8_6
+; CGP-NEXT:  .LBB8_8:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v9
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v9
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2

@@ -3113,7 +3125,6 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
-; CGP-NEXT:  .LBB8_8:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
@@ -16,8 +16,14 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_execz .LBB0_2
-; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_3
+; CHECK-NEXT:  ; %bb.1: ; %Flow
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_4
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; CHECK-NEXT:  .LBB0_3:
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v2, v0
 ; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v3, v0, vcc

@@ -149,11 +155,9 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
 ; CHECK-NEXT:    ; implicit-def: $vgpr2
 ; CHECK-NEXT:    ; implicit-def: $vgpr4
-; CHECK-NEXT:  .LBB0_2: ; %Flow
-; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
-; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_execz .LBB0_4
-; CHECK-NEXT:  ; %bb.3:
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
+; CHECK-NEXT:    s_cbranch_execz .LBB0_2
+; CHECK-NEXT:  .LBB0_4:
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, 0, v2
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0

@@ -172,7 +176,6 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:  .LBB0_4:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 %num, %den

@@ -775,8 +778,7 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    ; implicit-def: $vgpr4
 ; CGP-NEXT:    ; implicit-def: $vgpr10
 ; CGP-NEXT:  .LBB2_2: ; %Flow1
-; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
-; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; CGP-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_cbranch_execz .LBB2_4
 ; CGP-NEXT:  ; %bb.3:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v4

@@ -805,8 +807,14 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
-; CGP-NEXT:    s_cbranch_execz .LBB2_6
-; CGP-NEXT:  ; %bb.5:
+; CGP-NEXT:    s_cbranch_execnz .LBB2_7
+; CGP-NEXT:  ; %bb.5: ; %Flow
+; CGP-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
+; CGP-NEXT:    s_cbranch_execnz .LBB2_8
+; CGP-NEXT:  .LBB2_6:
+; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CGP-NEXT:    s_setpc_b64 s[30:31]
+; CGP-NEXT:  .LBB2_7:
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v6, v2
 ; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v7, v2, vcc

@@ -938,11 +946,9 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v10, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr6
 ; CGP-NEXT:    ; implicit-def: $vgpr8
-; CGP-NEXT:  .LBB2_6: ; %Flow
-; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
-; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; CGP-NEXT:    s_cbranch_execz .LBB2_8
-; CGP-NEXT:  ; %bb.7:
+; CGP-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
+; CGP-NEXT:    s_cbranch_execz .LBB2_6
+; CGP-NEXT:  .LBB2_8:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v6
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v6
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2

@@ -961,7 +967,6 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
-; CGP-NEXT:  .LBB2_8:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = srem <2 x i64> %num, %den

@@ -2294,8 +2299,14 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_execz .LBB7_2
-; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    s_cbranch_execnz .LBB7_3
+; CHECK-NEXT:  ; %bb.1: ; %Flow
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
+; CHECK-NEXT:    s_cbranch_execnz .LBB7_4
+; CHECK-NEXT:  .LBB7_2:
+; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; CHECK-NEXT:  .LBB7_3:
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v6
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v5, v0
 ; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v6, v0, vcc

@@ -2427,11 +2438,9 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
 ; CHECK-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; CHECK-NEXT:    ; implicit-def: $vgpr3
-; CHECK-NEXT:  .LBB7_2: ; %Flow
-; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
-; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_execz .LBB7_4
-; CHECK-NEXT:  ; %bb.3:
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
+; CHECK-NEXT:    s_cbranch_execz .LBB7_2
+; CHECK-NEXT:  .LBB7_4:
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v5
 ; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, 0, v5
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0

@@ -2450,7 +2459,6 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:  .LBB7_4:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl i64 4096, %y

@@ -2910,8 +2918,14 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
-; CGP-NEXT:    s_cbranch_execz .LBB8_6
-; CGP-NEXT:  ; %bb.5:
+; CGP-NEXT:    s_cbranch_execnz .LBB8_7
+; CGP-NEXT:  ; %bb.5: ; %Flow
+; CGP-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
+; CGP-NEXT:    s_cbranch_execnz .LBB8_8
+; CGP-NEXT:  .LBB8_6:
+; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CGP-NEXT:    s_setpc_b64 s[30:31]
+; CGP-NEXT:  .LBB8_7:
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v10
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v9, v2
 ; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v10, v2, vcc

@@ -3043,11 +3057,9 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v10, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; CGP-NEXT:    ; implicit-def: $vgpr5
-; CGP-NEXT:  .LBB8_6: ; %Flow
-; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
-; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; CGP-NEXT:    s_cbranch_execz .LBB8_8
-; CGP-NEXT:  ; %bb.7:
+; CGP-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
+; CGP-NEXT:    s_cbranch_execz .LBB8_6
+; CGP-NEXT:  .LBB8_8:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v9
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v9
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2

@@ -3066,7 +3078,6 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
-; CGP-NEXT:  .LBB8_8:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
@@ -16,8 +16,14 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_execz .LBB0_2
-; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_3
+; CHECK-NEXT:  ; %bb.1: ; %Flow
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_4
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; CHECK-NEXT:  .LBB0_3:
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v2
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, v3
 ; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2

@@ -145,11 +151,9 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; CHECK-NEXT:    ; implicit-def: $vgpr2
 ; CHECK-NEXT:    ; implicit-def: $vgpr4
-; CHECK-NEXT:  .LBB0_2: ; %Flow
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
-; CHECK-NEXT:    s_xor_b64 exec, exec, s[6:7]
-; CHECK-NEXT:    s_cbranch_execz .LBB0_4
-; CHECK-NEXT:  ; %bb.3:
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; CHECK-NEXT:    s_cbranch_execz .LBB0_2
+; CHECK-NEXT:  .LBB0_4:
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, 0, v2
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0

@@ -170,7 +174,6 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:  .LBB0_4:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 %num, %den

@@ -757,8 +760,7 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    ; implicit-def: $vgpr4
 ; CGP-NEXT:    ; implicit-def: $vgpr10
 ; CGP-NEXT:  .LBB2_2: ; %Flow1
-; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
-; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; CGP-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_cbranch_execz .LBB2_4
 ; CGP-NEXT:  ; %bb.3:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v4

@@ -789,8 +791,14 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
-; CGP-NEXT:    s_cbranch_execz .LBB2_6
-; CGP-NEXT:  ; %bb.5:
+; CGP-NEXT:    s_cbranch_execnz .LBB2_7
+; CGP-NEXT:  ; %bb.5: ; %Flow
+; CGP-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; CGP-NEXT:    s_cbranch_execnz .LBB2_8
+; CGP-NEXT:  .LBB2_6:
+; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CGP-NEXT:    s_setpc_b64 s[30:31]
+; CGP-NEXT:  .LBB2_7:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v6
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v7
 ; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v6

@@ -918,11 +926,9 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr6
 ; CGP-NEXT:    ; implicit-def: $vgpr8
-; CGP-NEXT:  .LBB2_6: ; %Flow
-; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
-; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
-; CGP-NEXT:    s_cbranch_execz .LBB2_8
-; CGP-NEXT:  ; %bb.7:
+; CGP-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; CGP-NEXT:    s_cbranch_execz .LBB2_6
+; CGP-NEXT:  .LBB2_8:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v6
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v6
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2

@@ -943,7 +949,6 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v6
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
-; CGP-NEXT:  .LBB2_8:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv <2 x i64> %num, %den

@@ -1073,8 +1078,14 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_execz .LBB7_2
-; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    s_cbranch_execnz .LBB7_3
+; CHECK-NEXT:  ; %bb.1: ; %Flow
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; CHECK-NEXT:    s_cbranch_execnz .LBB7_4
+; CHECK-NEXT:  .LBB7_2:
+; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; CHECK-NEXT:  .LBB7_3:
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v5
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, v6
 ; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, 0, v5

@@ -1202,11 +1213,9 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; CHECK-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; CHECK-NEXT:    ; implicit-def: $vgpr3
-; CHECK-NEXT:  .LBB7_2: ; %Flow
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
-; CHECK-NEXT:    s_xor_b64 exec, exec, s[6:7]
-; CHECK-NEXT:    s_cbranch_execz .LBB7_4
-; CHECK-NEXT:  ; %bb.3:
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; CHECK-NEXT:    s_cbranch_execz .LBB7_2
+; CHECK-NEXT:  .LBB7_4:
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v5
 ; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, 0, v5
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0

@@ -1227,7 +1236,6 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:  .LBB7_4:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl i64 4096, %y

@@ -1672,8 +1680,14 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
-; CGP-NEXT:    s_cbranch_execz .LBB8_6
-; CGP-NEXT:  ; %bb.5:
+; CGP-NEXT:    s_cbranch_execnz .LBB8_7
+; CGP-NEXT:  ; %bb.5: ; %Flow
+; CGP-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; CGP-NEXT:    s_cbranch_execnz .LBB8_8
+; CGP-NEXT:  .LBB8_6:
+; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CGP-NEXT:    s_setpc_b64 s[30:31]
+; CGP-NEXT:  .LBB8_7:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v9
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v10
 ; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v9

@@ -1801,11 +1815,9 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; CGP-NEXT:    ; implicit-def: $vgpr5
-; CGP-NEXT:  .LBB8_6: ; %Flow
-; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
-; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
-; CGP-NEXT:    s_cbranch_execz .LBB8_8
-; CGP-NEXT:  ; %bb.7:
+; CGP-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; CGP-NEXT:    s_cbranch_execz .LBB8_6
+; CGP-NEXT:  .LBB8_8:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v9
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v9
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2

@@ -1826,7 +1838,6 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
-; CGP-NEXT:  .LBB8_8:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
@@ -16,8 +16,14 @@ define i64 @v_urem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_execz .LBB0_2
-; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_3
+; CHECK-NEXT:  ; %bb.1: ; %Flow
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_4
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; CHECK-NEXT:  .LBB0_3:
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v2
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, v3
 ; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2

@@ -144,11 +150,9 @@ define i64 @v_urem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; CHECK-NEXT:    ; implicit-def: $vgpr2
 ; CHECK-NEXT:    ; implicit-def: $vgpr4
-; CHECK-NEXT:  .LBB0_2: ; %Flow
-; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
-; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_execz .LBB0_4
-; CHECK-NEXT:  ; %bb.3:
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
+; CHECK-NEXT:    s_cbranch_execz .LBB0_2
+; CHECK-NEXT:  .LBB0_4:
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, 0, v2
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0

@@ -167,7 +171,6 @@ define i64 @v_urem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:  .LBB0_4:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i64 %num, %den

@@ -748,8 +751,7 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    ; implicit-def: $vgpr4
 ; CGP-NEXT:    ; implicit-def: $vgpr10
 ; CGP-NEXT:  .LBB2_2: ; %Flow1
-; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
-; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; CGP-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_cbranch_execz .LBB2_4
 ; CGP-NEXT:  ; %bb.3:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v4

@@ -778,8 +780,14 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
-; CGP-NEXT:    s_cbranch_execz .LBB2_6
-; CGP-NEXT:  ; %bb.5:
+; CGP-NEXT:    s_cbranch_execnz .LBB2_7
+; CGP-NEXT:  ; %bb.5: ; %Flow
+; CGP-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
+; CGP-NEXT:    s_cbranch_execnz .LBB2_8
+; CGP-NEXT:  .LBB2_6:
+; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CGP-NEXT:    s_setpc_b64 s[30:31]
+; CGP-NEXT:  .LBB2_7:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v6
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v7
 ; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v6

@@ -906,11 +914,9 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v3, v4, v7, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr6
 ; CGP-NEXT:    ; implicit-def: $vgpr8
-; CGP-NEXT:  .LBB2_6: ; %Flow
-; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
-; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; CGP-NEXT:    s_cbranch_execz .LBB2_8
-; CGP-NEXT:  ; %bb.7:
+; CGP-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
+; CGP-NEXT:    s_cbranch_execz .LBB2_6
+; CGP-NEXT:  .LBB2_8:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v6
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v6
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2

@@ -929,7 +935,6 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
-; CGP-NEXT:  .LBB2_8:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i64> %num, %den

@@ -1612,8 +1617,14 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_execz .LBB7_2
-; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    s_cbranch_execnz .LBB7_3
+; CHECK-NEXT:  ; %bb.1: ; %Flow
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
+; CHECK-NEXT:    s_cbranch_execnz .LBB7_4
+; CHECK-NEXT:  .LBB7_2:
+; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; CHECK-NEXT:  .LBB7_3:
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v5
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, v6
 ; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, 0, v5

@@ -1740,11 +1751,9 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v2, v5, vcc
 ; CHECK-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; CHECK-NEXT:    ; implicit-def: $vgpr3
-; CHECK-NEXT:  .LBB7_2: ; %Flow
-; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
-; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_execz .LBB7_4
-; CHECK-NEXT:  ; %bb.3:
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
+; CHECK-NEXT:    s_cbranch_execz .LBB7_2
+; CHECK-NEXT:  .LBB7_4:
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v5
 ; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, 0, v5
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0

@@ -1763,7 +1772,6 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:  .LBB7_4:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl i64 4096, %y

@@ -2203,8 +2211,14 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
-; CGP-NEXT:    s_cbranch_execz .LBB8_6
-; CGP-NEXT:  ; %bb.5:
+; CGP-NEXT:    s_cbranch_execnz .LBB8_7
+; CGP-NEXT:  ; %bb.5: ; %Flow
+; CGP-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
+; CGP-NEXT:    s_cbranch_execnz .LBB8_8
+; CGP-NEXT:  .LBB8_6:
+; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CGP-NEXT:    s_setpc_b64 s[30:31]
+; CGP-NEXT:  .LBB8_7:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v9
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v10
 ; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v9

@@ -2331,11 +2345,9 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v3, v4, v7, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; CGP-NEXT:    ; implicit-def: $vgpr5
-; CGP-NEXT:  .LBB8_6: ; %Flow
-; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
-; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; CGP-NEXT:    s_cbranch_execz .LBB8_8
-; CGP-NEXT:  ; %bb.7:
+; CGP-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
+; CGP-NEXT:    s_cbranch_execz .LBB8_6
+; CGP-NEXT:  .LBB8_8:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v9
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v9
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2

@@ -2354,7 +2366,6 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
-; CGP-NEXT:  .LBB8_8:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
@@ -431,9 +431,9 @@ endif:
 ; GCN-NEXT: s_and_saveexec_b64 [[TEMP_MASK:s\[[0-9]+:[0-9]+\]]], vcc
 ; GCN-NEXT: s_xor_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[TEMP_MASK]]
 
-; GCN: BB{{[0-9]+_[0-9]+}}: ; %Flow
-; GCN-NEXT: s_or_saveexec_b64 [[TEMP_MASK1:s\[[0-9]+:[0-9]+\]]], [[MASK]]
-; GCN-NEXT: s_xor_b64 exec, exec, [[TEMP_MASK1]]
+; GCN: .LBB{{[0-9]+_[0-9]+}}: ; %Flow1
+; GCN-NEXT: s_andn2_saveexec_b64 [[MASK]], [[MASK]]
+; GCN-NEXT: s_cbranch_execnz
 
 ; GCN: .L[[LOOP_BODY:BB[0-9]+_[0-9]+]]: ; %loop{{$}}
 ; GCN: ;;#ASMSTART
@@ -123,8 +123,7 @@ define i64 @sdiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX9-NEXT:  .LBB0_2: ; %Flow
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
-; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
 ; GFX9-NEXT:    s_cbranch_execz .LBB0_4
 ; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2

@@ -260,8 +259,7 @@ define i64 @udiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX9-NEXT:  .LBB1_2: ; %Flow
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
-; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
 ; GFX9-NEXT:    s_cbranch_execz .LBB1_4
 ; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2

@@ -410,8 +408,7 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX9-NEXT:  .LBB2_2: ; %Flow
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[8:9]
-; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[8:9]
 ; GFX9-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2

@@ -544,8 +541,7 @@ define i64 @urem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX9-NEXT:  .LBB3_2: ; %Flow
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[8:9]
-; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[8:9]
 ; GFX9-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2

@@ -831,8 +827,7 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX9-NEXT:  .LBB8_2: ; %Flow
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[10:11]
-; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
 ; GFX9-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2

@@ -984,8 +979,7 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX9-NEXT:  .LBB9_2: ; %Flow
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[8:9]
-; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[8:9]
 ; GFX9-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
@@ -149,8 +149,7 @@ bb.outer.end: ; preds = %bb.inner.then, %bb
 ; GCN-NEXT: ; %bb.{{[0-9]+}}:
 ; GCN: store_dword
 ; GCN: {{^}}[[THEN_INNER]]:
-; GCN-NEXT: s_or_saveexec_b64 [[SAVEEXEC_INNER3:s\[[0-9:]+\]]], [[SAVEEXEC_INNER2]]
-; GCN-NEXT: s_xor_b64 exec, exec, [[SAVEEXEC_INNER3]]
+; GCN-NEXT: s_andn2_saveexec_b64 [[SAVEEXEC_INNER2]], [[SAVEEXEC_INNER2]]
 ; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]]
 ; GCN: store_dword
 ; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:

@@ -241,8 +240,7 @@ bb.outer.end: ; preds = %bb, %bb.then, %b
 ; GCN-NEXT: {{^}}[[THEN_OUTER_FLOW]]:
 ; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_ELSE]]
 ; GCN: {{^}}[[THEN_OUTER]]:
-; GCN-NEXT: s_or_saveexec_b64 [[SAVEEXEC_OUTER3:s\[[0-9:]+\]]], [[SAVEEXEC_OUTER2]]
-; GCN-NEXT: s_xor_b64 exec, exec, [[SAVEEXEC_OUTER3]]
+; GCN-NEXT: s_andn2_saveexec_b64 [[SAVEEXEC_OUTER2]], [[SAVEEXEC_OUTER2]]
 ; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]]
 ; GCN-NEXT: ; %bb.{{[0-9]+}}:
 ; GCN: store_dword

@@ -252,7 +250,7 @@ bb.outer.end: ; preds = %bb, %bb.then, %b
 ; GCN: store_dword
 ; GCN-NEXT: [[FLOW1]]:
 ; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_ELSE]]
-; GCN: s_or_b64 exec, exec, [[SAVEEXEC_OUTER3]]
+; GCN: s_or_b64 exec, exec, [[SAVEEXEC_OUTER2]]
 ; GCN: ds_write_b32
 ; GCN: s_endpgm
 ;
@@ -3,8 +3,7 @@
 
 ; CHECK-LABEL: {{^}}else_no_execfix:
 ; CHECK: ; %Flow
-; CHECK-NEXT: s_or_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]],
-; CHECK-NEXT: s_xor_b64 exec, exec, [[DST]]
+; CHECK-NEXT: s_andn2_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]], [[DST]]
 define amdgpu_ps float @else_no_execfix(i32 %z, float %v) #0 {
 main_body:
   %cc = icmp sgt i32 %z, 5
@ -28,8 +28,7 @@ define amdgpu_ps void @return_void(float %0) #0 {
|
|||
; CHECK-NEXT: s_mov_b64 vcc, 0
|
||||
; CHECK-NEXT: s_branch .LBB0_1
|
||||
; CHECK-NEXT: .LBB0_3: ; %Flow1
|
||||
; CHECK-NEXT: s_or_saveexec_b64 s[0:1], s[2:3]
|
||||
; CHECK-NEXT: s_xor_b64 exec, exec, s[0:1]
|
||||
; CHECK-NEXT: s_andn2_saveexec_b64 s[0:1], s[2:3]
|
||||
; CHECK-NEXT: s_cbranch_execz .LBB0_5
|
||||
; CHECK-NEXT: ; %bb.4: ; %end
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 1.0
|
||||
|

@ -73,8 +72,7 @@ define amdgpu_ps void @return_void_compr(float %0) #0 {
; CHECK-NEXT: s_mov_b64 vcc, 0
; CHECK-NEXT: s_branch .LBB1_1
; CHECK-NEXT: .LBB1_3: ; %Flow1
; CHECK-NEXT: s_or_saveexec_b64 s[0:1], s[2:3]
; CHECK-NEXT: s_xor_b64 exec, exec, s[0:1]
; CHECK-NEXT: s_andn2_saveexec_b64 s[0:1], s[2:3]
; CHECK-NEXT: s_cbranch_execz .LBB1_5
; CHECK-NEXT: ; %bb.4: ; %end
; CHECK-NEXT: v_mov_b32_e32 v0, 0

@ -182,8 +182,7 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr
; CHECK-NEXT: ; %bb.1: ; %ELSE
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; CHECK-NEXT: .LBB6_2: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; CHECK-NEXT: s_xor_b64 exec, exec, s[0:1]
; CHECK-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; CHECK-NEXT: s_cbranch_execz .LBB6_4
; CHECK-NEXT: ; %bb.3: ; %IF
; CHECK-NEXT: v_mov_b32_e32 v0, s12

@ -238,8 +237,7 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr
; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: .LBB7_2: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 s[0:1], s[16:17]
; CHECK-NEXT: s_xor_b64 exec, exec, s[0:1]
; CHECK-NEXT: s_andn2_saveexec_b64 s[0:1], s[16:17]
; CHECK-NEXT: s_cbranch_execz .LBB7_4
; CHECK-NEXT: ; %bb.3: ; %IF
; CHECK-NEXT: v_mov_b32_e32 v0, s12

@ -73,8 +73,7 @@
; GCN-NEXT: s_and_b64 [[EXIT1]], vcc, exec

; GCN: ; %Flow
; GCN-NEXT: s_or_saveexec_b64
; GCN-NEXT: s_xor_b64
; GCN-NEXT: s_andn2_saveexec_b64

; GCN: ; %LeafBlock
; GCN-DAG: v_cmp_eq_u32_e32 vcc, 1,

@ -0,0 +1,127 @@
# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,WAVE32 %s
# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,WAVE64 %s

---

# By the time the post-RA si-optimize-exec-masking pass runs, codegen can end
# up with the following sequence:
# s_or_saveexec_b32 s0, s0
# s_xor_b32 exec_lo, exec_lo, s0
#
# This can be combined into one instruction:
# s_andn2_saveexec_b32 s0, s0
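
The identity behind the combine is worth spelling out: s_or_saveexec d, s saves d = exec and sets exec = exec | s, and the trailing s_xor exec, exec, d then leaves exec = (exec | s) ^ exec = s & ~exec, which is exactly what s_andn2_saveexec d, s computes while saving the same old exec in d. A small standalone check of that bitwise identity (plain C++, an editorial sketch, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // Exhaustively check (exec | s) ^ exec == s & ~exec on 8-bit masks; the
  // identity is per-bit, so it holds equally for b32 and b64 exec masks.
  for (uint32_t exec = 0; exec < 256; ++exec) {
    for (uint32_t s = 0; s < 256; ++s) {
      uint32_t d = exec;                    // s_or_saveexec d, s (save part)
      uint32_t afterOrXor = (exec | s) ^ d; // exec |= s; then exec ^= d
      uint32_t afterAndn2 = s & ~d;         // s_andn2_saveexec d, s
      assert(afterOrXor == afterAndn2);
    }
  }
  return 0;
}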

# Ensure the transformation gets applied in the b32 case.
# GCN-LABEL: name: s_or_saveexec_xor_combine_b32
# WAVE32: S_ANDN2_SAVEEXEC_B32
name: s_or_saveexec_xor_combine_b32
tracksRegLiveness: true
body: |
  bb.0:
    liveins: $sgpr0
    renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
    $exec_lo = S_XOR_B32 $exec_lo, renamable $sgpr0, implicit-def $scc
...

---

# Ensure the transformation gets applied in the b64 case.
# GCN-LABEL: name: s_or_saveexec_xor_combine_b64
# WAVE64: S_ANDN2_SAVEEXEC_B64
name: s_or_saveexec_xor_combine_b64
tracksRegLiveness: true
body: |
  bb.0:
    liveins: $sgpr0_sgpr1
    renamable $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 killed renamable $sgpr0_sgpr1, implicit-def $exec, implicit-def $scc, implicit $exec
    $exec = S_XOR_B64 $exec, renamable $sgpr0_sgpr1, implicit-def $scc
...

---

# Ensure the transformation does get applied even if the operands are swapped.
# GCN-LABEL: name: s_or_saveexec_xor_combine_b32_swap
# WAVE32: S_ANDN2_SAVEEXEC_B32
name: s_or_saveexec_xor_combine_b32_swap
tracksRegLiveness: true
body: |
  bb.0:
    liveins: $sgpr0
    renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
    $exec_lo = S_XOR_B32 renamable $sgpr0, $exec_lo, implicit-def $scc
...

---

# Ensure the transformation does get applied if source and dest operand for s_or_saveexec are not equal.
# GCN-LABEL: name: s_or_saveexec_xor_combine_b32_inequal_operands
# WAVE32: S_ANDN2_SAVEEXEC
name: s_or_saveexec_xor_combine_b32_inequal_operands
tracksRegLiveness: true
body: |
  bb.0:
    liveins: $sgpr0, $sgpr1
    renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr1, implicit-def $exec, implicit-def $scc, implicit $exec
    $exec_lo = S_XOR_B32 $exec_lo, renamable $sgpr0, implicit-def $scc
...

---

# Ensure the transformation does not get applied if s_xor does not use the s_or_saveexec destination as an input operand.
# GCN-LABEL: name: s_or_saveexec_xor_combine_b32_wrong_input
# WAVE32: S_OR_SAVEEXEC
# WAVE32: S_XOR_B32
name: s_or_saveexec_xor_combine_b32_wrong_input
tracksRegLiveness: true
body: |
  bb.0:
    liveins: $sgpr0, $sgpr1
    renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
    $exec_lo = S_XOR_B32 $exec_lo, renamable $sgpr1, implicit-def $scc
...

---

# Ensure the transformation does not get applied if the instructions don't appear sequentially.
# GCN-LABEL: name: s_or_saveexec_xor_combine_b32_non_sequence
# WAVE32: S_OR_SAVEEXEC
# WAVE32: S_MOV_B32
# WAVE32: S_XOR_B32
name: s_or_saveexec_xor_combine_b32_non_sequence
tracksRegLiveness: true
body: |
  bb.0:
    liveins: $sgpr0, $sgpr1
    renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
    renamable $sgpr1 = S_MOV_B32 renamable $sgpr0
    $exec_lo = S_XOR_B32 $exec_lo, renamable $sgpr1, implicit-def $scc
...

---

# Don't apply the transformation if the basic block only has a single instruction.

# GCN-LABEL: name: s_or_saveexec_xor_combine_b32_last_inst
# WAVE32: S_OR_SAVEEXEC
name: s_or_saveexec_xor_combine_b32_last_inst
tracksRegLiveness: true
body: |
  bb.0:
    liveins: $sgpr0
    renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
...

---

# Don't apply the transformation if the basic block ends with an S_OR_SAVEEXEC_B32 instruction.

# GCN-LABEL: name: s_or_saveexec_xor_combine_b32_or_saveexec_terminator
# WAVE32: S_MOV_B32
# WAVE32: S_OR_SAVEEXEC
name: s_or_saveexec_xor_combine_b32_or_saveexec_terminator
tracksRegLiveness: true
body: |
  bb.0:
    liveins: $sgpr0, $sgpr1
    renamable $sgpr1 = S_MOV_B32 renamable $sgpr0
    renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
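
Read together, the MIR cases above pin down when the combine fires: the two instructions must be adjacent, the s_xor must write exec, and it must combine exec with the s_or_saveexec destination, in either operand order; the saveexec source register is unconstrained. A minimal sketch of that predicate (hypothetical types and helper names, not the pass's actual code):

#include <cassert>

// Illustrative stand-ins, not the pass's real data structures.
enum Opcode { OpOrSaveexec, OpXor, OpMov };
constexpr int EXEC = -1; // sentinel register id for the exec mask

struct Inst {
  Opcode Op;
  int Dst, Src0, Src1; // register ids; Src1 is unused for mov/saveexec
};

// Matches the adjacent pair
//   Dst = s_or_saveexec Src0
//   exec = s_xor exec, Dst   (the xor operands may appear in either order)
// which is the shape rewritten to "s_andn2_saveexec Dst, Src0".
bool matchOrSaveexecXor(const Inst &Or, const Inst &Xor) {
  if (Or.Op != OpOrSaveexec || Xor.Op != OpXor || Xor.Dst != EXEC)
    return false;
  return (Xor.Src0 == EXEC && Xor.Src1 == Or.Dst) ||
         (Xor.Src1 == EXEC && Xor.Src0 == Or.Dst);
}

int main() {
  Inst Or = {OpOrSaveexec, /*Dst=*/0, /*Src0=*/1, /*Src1=*/0};
  assert(matchOrSaveexecXor(Or, {OpXor, EXEC, EXEC, 0}));  // plain order
  assert(matchOrSaveexecXor(Or, {OpXor, EXEC, 0, EXEC}));  // swapped order
  assert(!matchOrSaveexecXor(Or, {OpXor, EXEC, EXEC, 5})); // wrong input
  return 0;
}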

@ -73,8 +73,13 @@ define amdgpu_cs void @if_else_vgpr_opt(<4 x i32> inreg %input, <4 x i32> inreg
; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 3, v0
; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GCN-NEXT: s_xor_b32 s0, exec_lo, s0
; GCN-NEXT: s_cbranch_execz .LBB1_4
; GCN-NEXT: ; %bb.3: ; %.else
; GCN-NEXT: s_cbranch_execnz .LBB1_5
; GCN-NEXT: ; %bb.3: ; %Flow
; GCN-NEXT: s_andn2_saveexec_b32 s0, s0
; GCN-NEXT: s_cbranch_execnz .LBB1_6
; GCN-NEXT: .LBB1_4: ; %.end
; GCN-NEXT: s_endpgm
; GCN-NEXT: .LBB1_5: ; %.else
; GCN-NEXT: s_or_saveexec_b32 s1, -1
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 exec_lo, s1

@ -89,15 +94,11 @@ define amdgpu_cs void @if_else_vgpr_opt(<4 x i32> inreg %input, <4 x i32> inreg
; GCN-NEXT: v_mov_b32_e32 v3, -1
; GCN-NEXT: buffer_store_dword v3, v0, s[4:7], 0 offen
; GCN-NEXT: ; implicit-def: $vgpr3
; GCN-NEXT: .LBB1_4: ; %Flow
; GCN-NEXT: s_or_saveexec_b32 s0, s0
; GCN-NEXT: s_waitcnt_depctr 0xffe3
; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GCN-NEXT: s_cbranch_execz .LBB1_6
; GCN-NEXT: ; %bb.5: ; %.then
; GCN-NEXT: s_andn2_saveexec_b32 s0, s0
; GCN-NEXT: s_cbranch_execz .LBB1_4
; GCN-NEXT: .LBB1_6: ; %.then
; GCN-NEXT: v_mov_b32_e32 v0, -1
; GCN-NEXT: buffer_store_dword v0, v3, s[4:7], 0 offen
; GCN-NEXT: .LBB1_6: ; %.end
; GCN-NEXT: s_endpgm
.entry:
  %LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i32 0

@ -174,8 +174,7 @@ define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out,
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: .LBB3_2: ; %Flow
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_or_saveexec_b64 s[0:1], s[10:11]
; SI-NEXT: s_xor_b64 exec, exec, s[0:1]
; SI-NEXT: s_andn2_saveexec_b64 s[0:1], s[10:11]
; SI-NEXT: s_cbranch_execz .LBB3_4
; SI-NEXT: ; %bb.3: ; %if
; SI-NEXT: s_mov_b32 s15, 0xf000

@ -81,58 +81,60 @@ else: ; preds = %else.if.cond
}

define amdgpu_ps { <4 x float> } @test_return_to_epilog_with_optimized_kill(float %val) #0 {
; GCN-LABEL: name: test_return_to_epilog_with_optimized_kill
; GCN: bb.0.entry:
; GCN: successors: %bb.1(0x40000000), %bb.4(0x40000000)
; GCN: liveins: $vgpr0
; GCN: renamable $vgpr1 = nofpexcept V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec
; GCN: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $mode, implicit $exec
; GCN: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GCN: renamable $sgpr2_sgpr3 = S_XOR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def dead $scc
; GCN: S_CBRANCH_EXECZ %bb.4, implicit $exec
; GCN: bb.1.flow.preheader:
; GCN: successors: %bb.2(0x80000000)
; GCN: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3
; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec
; GCN: renamable $sgpr4_sgpr5 = S_MOV_B64 0
; GCN: bb.2.flow:
; GCN: successors: %bb.3(0x04000000), %bb.2(0x7c000000)
; GCN: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
; GCN: renamable $sgpr6_sgpr7 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc
; GCN: renamable $sgpr4_sgpr5 = S_OR_B64 killed renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def $scc
; GCN: $exec = S_ANDN2_B64 $exec, renamable $sgpr4_sgpr5, implicit-def $scc
; GCN: S_CBRANCH_EXECNZ %bb.2, implicit $exec
; GCN: bb.3.Flow:
; GCN: successors: %bb.4(0x80000000)
; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
; GCN: bb.4.Flow1:
; GCN: successors: %bb.5(0x40000000), %bb.7(0x40000000)
; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
; GCN: renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec
; GCN: $exec = S_XOR_B64 $exec, renamable $sgpr2_sgpr3, implicit-def $scc
; GCN: S_CBRANCH_EXECZ %bb.7, implicit $exec
; GCN: bb.5.kill0:
; GCN: successors: %bb.6(0x40000000), %bb.8(0x40000000)
; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
; GCN: dead renamable $sgpr0_sgpr1 = S_ANDN2_B64 killed renamable $sgpr0_sgpr1, $exec, implicit-def $scc
; GCN: S_CBRANCH_SCC0 %bb.8, implicit $scc
; GCN: bb.6.kill0:
; GCN: successors: %bb.7(0x80000000)
; GCN: liveins: $sgpr2_sgpr3, $scc
; GCN: $exec = S_MOV_B64 0
; GCN: bb.7.end:
; GCN: successors: %bb.9(0x80000000)
; GCN: liveins: $sgpr2_sgpr3
; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
; GCN: S_BRANCH %bb.9
; GCN: bb.8:
; GCN: $exec = S_MOV_B64 0
; GCN: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec
; GCN: S_ENDPGM 0
; GCN: bb.9:
entry:
; GCN-LABEL: name: test_return_to_epilog_with_optimized_kill
; GCN: bb.0 (%ir-block.0):
; GCN: successors: %bb.3(0x40000000), %bb.1(0x40000000)
; GCN: liveins: $vgpr0
; GCN: renamable $vgpr1 = nofpexcept V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec
; GCN: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $mode, implicit $exec
; GCN: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GCN: renamable $sgpr2_sgpr3 = S_XOR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def dead $scc
; GCN: S_CBRANCH_EXECNZ %bb.3, implicit $exec
; GCN: bb.1.Flow1:
; GCN: successors: %bb.6(0x40000000), %bb.2(0x40000000)
; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
; GCN: $sgpr2_sgpr3 = S_ANDN2_SAVEEXEC_B64 killed $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec
; GCN: S_CBRANCH_EXECNZ %bb.6, implicit $exec
; GCN: bb.2.end:
; GCN: successors: %bb.9(0x80000000)
; GCN: liveins: $sgpr2_sgpr3
; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
; GCN: S_BRANCH %bb.9
; GCN: bb.3.flow.preheader:
; GCN: successors: %bb.4(0x80000000)
; GCN: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3
; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec
; GCN: renamable $sgpr4_sgpr5 = S_MOV_B64 0
; GCN: bb.4.flow:
; GCN: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
; GCN: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
; GCN: renamable $sgpr6_sgpr7 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc
; GCN: renamable $sgpr4_sgpr5 = S_OR_B64 killed renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def $scc
; GCN: $exec = S_ANDN2_B64 $exec, renamable $sgpr4_sgpr5, implicit-def $scc
; GCN: S_CBRANCH_EXECNZ %bb.4, implicit $exec
; GCN: bb.5.Flow:
; GCN: successors: %bb.6(0x40000000), %bb.2(0x40000000)
; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
; GCN: $sgpr2_sgpr3 = S_ANDN2_SAVEEXEC_B64 killed $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec
; GCN: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GCN: bb.6.kill0:
; GCN: successors: %bb.7(0x40000000), %bb.8(0x40000000)
; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
; GCN: dead renamable $sgpr0_sgpr1 = S_ANDN2_B64 killed renamable $sgpr0_sgpr1, $exec, implicit-def $scc
; GCN: S_CBRANCH_SCC0 %bb.8, implicit $scc
; GCN: bb.7.kill0:
; GCN: successors: %bb.9(0x80000000)
; GCN: liveins: $sgpr2_sgpr3, $scc
; GCN: $exec = S_MOV_B64 0
; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
; GCN: S_BRANCH %bb.9
; GCN: bb.8:
; GCN: $exec = S_MOV_B64 0
; GCN: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec
; GCN: S_ENDPGM 0
; GCN: bb.9:
  %.i0 = fdiv reassoc nnan nsz arcp contract afn float 1.000000e+00, %val
  %cmp0 = fcmp olt float %.i0, 0.000000e+00
  br i1 %cmp0, label %kill0, label %flow

@ -22,8 +22,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; v_mov should be after exec modification
; SI: [[FLOW_BB]]:
; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]
; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
; SI-NEXT: s_andn2_saveexec_b64 [[SAVE2]], [[SAVE2]]
;
define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
entry:

@ -121,8 +120,7 @@ exit:
; SI: s_cbranch_execnz [[EXIT:.LBB[0-9]+_[0-9]+]]

; SI-NEXT: {{^.LBB[0-9]+_[0-9]+}}: ; %Flow
; SI-NEXT: s_or_saveexec_b64
; SI-NEXT: s_xor_b64 exec, exec
; SI-NEXT: s_andn2_saveexec_b64 [[BR_SREG]], [[BR_SREG]]
; SI-NEXT: s_cbranch_execz [[UNIFIED_RETURN:.LBB[0-9]+_[0-9]+]]

; SI-NEXT: ; %bb.{{[0-9]+}}: ; %then

@ -163,7 +161,6 @@ exit:
; SI: s_cbranch_scc1 [[LABEL_LOOP]]
; SI: [[LABEL_EXIT]]:
; SI: s_endpgm

define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

@ -223,7 +220,6 @@ exit:
; SI: [[LABEL_EXIT]]:
; SI-NOT: [[COND_STATE]]
; SI: s_endpgm

define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0

@ -134,7 +134,7 @@ l2:
; any of the v_cmp source operands.

; GCN-LABEL: check_saveexec_overwrites_vcmp_source:
; GCN: ; %bb.1: ; %then
; GCN: .LBB7_3: ; %then
; GFX1010: v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
; GFX1010-NEXT: v_mov_b32_e32 {{.*}}, s[[A]]
; GFX1010-NEXT: s_and_saveexec_b32 s[[A]], vcc_lo

@ -9,17 +9,23 @@ define amdgpu_ps float @else1(i32 %z, float %v) #0 {
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo
; SI-NEXT: s_xor_b32 s0, exec_lo, s0
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_cbranch_execnz .LBB0_3
; SI-NEXT: ; %bb.1: ; %Flow
; SI-NEXT: s_andn2_saveexec_b32 s0, s0
; SI-NEXT: s_cbranch_execnz .LBB0_4
; SI-NEXT: .LBB0_2: ; %end
; SI-NEXT: s_or_b32 exec_lo, exec_lo, s0
; SI-NEXT: s_branch .LBB0_5
; SI-NEXT: .LBB0_3: ; %else
; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; %bb.2: ; %Flow
; SI-NEXT: s_or_saveexec_b32 s0, s0
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; SI-NEXT: ; %bb.3: ; %if
; SI-NEXT: s_andn2_saveexec_b32 s0, s0
; SI-NEXT: s_cbranch_execz .LBB0_2
; SI-NEXT: .LBB0_4: ; %if
; SI-NEXT: v_add_f32_e32 v0, v1, v1
; SI-NEXT: ; %bb.4: ; %end
; SI-NEXT: s_or_b32 exec_lo, exec_lo, s0
; SI-NEXT: ; return to shader part epilog
; SI-NEXT: s_branch .LBB0_5
; SI-NEXT: .LBB0_5:
main_body:
  %cc = icmp sgt i32 %z, 5
  br i1 %cc, label %if, label %else

@ -49,8 +55,7 @@ define amdgpu_ps float @else2(i32 %z, float %v) #0 {
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v1
; SI-NEXT: ; %bb.2: ; %Flow
; SI-NEXT: s_or_saveexec_b32 s0, s0
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; SI-NEXT: s_andn2_saveexec_b32 s0, s0
; SI-NEXT: ; %bb.3: ; %if
; SI-NEXT: v_add_f32_e32 v1, v1, v1
; SI-NEXT: v_mov_b32_e32 v0, v1

@ -104,8 +109,7 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; %bb.4: ; %Flow
; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
; SI-NEXT: s_or_saveexec_b32 s2, s2
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s2
; SI-NEXT: s_andn2_saveexec_b32 s2, s2
; SI-NEXT: s_cbranch_execz .LBB2_1
; SI-NEXT: ; %bb.5: ; %if
; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1

@ -191,8 +195,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: .LBB3_4: ; %Flow
; SI-NEXT: s_or_saveexec_b32 s6, s6
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s6
; SI-NEXT: s_andn2_saveexec_b32 s6, s6
; SI-NEXT: s_cbranch_execz .LBB3_8
; SI-NEXT: ; %bb.5: ; %if
; SI-NEXT: s_mov_b32 s7, exec_lo

@ -267,8 +270,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float(
; SI-NEXT: s_mov_b32 exec_lo, s7
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: .LBB4_4: ; %Flow
; SI-NEXT: s_or_saveexec_b32 s6, s6
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s6
; SI-NEXT: s_andn2_saveexec_b32 s6, s6
; SI-NEXT: s_cbranch_execz .LBB4_8
; SI-NEXT: ; %bb.5: ; %if
; SI-NEXT: s_mov_b32 s7, exec_lo

@ -1232,8 +1232,7 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-W64-NEXT: ; implicit-def: $vgpr0
; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
; GFX9-W64-NEXT: .LBB23_2: ; %Flow
; GFX9-W64-NEXT: s_or_saveexec_b64 s[14:15], s[14:15]
; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15]
; GFX9-W64-NEXT: s_cbranch_execz .LBB23_4
; GFX9-W64-NEXT: ; %bb.3: ; %IF
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1

@ -1260,8 +1259,7 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-W32-NEXT: ; implicit-def: $vgpr0
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
; GFX10-W32-NEXT: .LBB23_2: ; %Flow
; GFX10-W32-NEXT: s_or_saveexec_b32 s13, s13
; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13
; GFX10-W32-NEXT: s_cbranch_execz .LBB23_4
; GFX10-W32-NEXT: ; %bb.3: ; %IF
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D

@ -1396,8 +1394,7 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3
; GFX9-W64-NEXT: v_lshlrev_b32_e32 v0, 2, v5
; GFX9-W64-NEXT: ; implicit-def: $vgpr5
; GFX9-W64-NEXT: ; %bb.2: ; %Flow
; GFX9-W64-NEXT: s_or_saveexec_b64 s[14:15], s[14:15]
; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15]
; GFX9-W64-NEXT: ; %bb.3: ; %IF
; GFX9-W64-NEXT: v_mul_lo_u32 v0, v5, 3
; GFX9-W64-NEXT: ; %bb.4: ; %END

@ -1427,8 +1424,7 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3
; GFX10-W32-NEXT: v_lshlrev_b32_e32 v0, 2, v5
; GFX10-W32-NEXT: ; implicit-def: $vgpr5
; GFX10-W32-NEXT: ; %bb.2: ; %Flow
; GFX10-W32-NEXT: s_or_saveexec_b32 s13, s13
; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13
; GFX10-W32-NEXT: ; %bb.3: ; %IF
; GFX10-W32-NEXT: v_mul_lo_u32 v0, v5, 3
; GFX10-W32-NEXT: ; %bb.4: ; %END

@ -1486,18 +1482,25 @@ define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-W64-NEXT: ; implicit-def: $vgpr0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX9-W64-NEXT: ; %bb.1: ; %ELSE
; GFX9-W64-NEXT: v_mul_f32_e32 v0, 4.0, v1
; GFX9-W64-NEXT: ; implicit-def: $vgpr1
; GFX9-W64-NEXT: ; %bb.2: ; %Flow
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: ; %bb.3: ; %IF
; GFX9-W64-NEXT: v_mul_f32_e32 v0, 0x40400000, v1
; GFX9-W64-NEXT: ; %bb.4: ; %END
; GFX9-W64-NEXT: s_cbranch_execnz .LBB26_3
; GFX9-W64-NEXT: ; %bb.1: ; %Flow
; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX9-W64-NEXT: s_cbranch_execnz .LBB26_4
; GFX9-W64-NEXT: .LBB26_2: ; %END
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
; GFX9-W64-NEXT: s_branch .LBB26_5
; GFX9-W64-NEXT: .LBB26_3: ; %ELSE
; GFX9-W64-NEXT: v_mul_f32_e32 v0, 4.0, v1
; GFX9-W64-NEXT: ; implicit-def: $vgpr1
; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX9-W64-NEXT: s_cbranch_execz .LBB26_2
; GFX9-W64-NEXT: .LBB26_4: ; %IF
; GFX9-W64-NEXT: v_mul_f32_e32 v0, 0x40400000, v1
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_branch .LBB26_5
; GFX9-W64-NEXT: .LBB26_5:
;
; GFX10-W32-LABEL: test_control_flow_3:
; GFX10-W32: ; %bb.0: ; %main_body

@ -1513,18 +1516,25 @@ define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-W32-NEXT: ; implicit-def: $vgpr0
; GFX10-W32-NEXT: v_cmpx_nlt_f32_e32 0, v1
; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1
; GFX10-W32-NEXT: ; implicit-def: $vgpr1
; GFX10-W32-NEXT: ; %bb.2: ; %Flow
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, s0
; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: ; %bb.3: ; %IF
; GFX10-W32-NEXT: v_mul_f32_e32 v0, 0x40400000, v1
; GFX10-W32-NEXT: ; %bb.4: ; %END
; GFX10-W32-NEXT: s_cbranch_execnz .LBB26_3
; GFX10-W32-NEXT: ; %bb.1: ; %Flow
; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0
; GFX10-W32-NEXT: s_cbranch_execnz .LBB26_4
; GFX10-W32-NEXT: .LBB26_2: ; %END
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: ; return to shader part epilog
; GFX10-W32-NEXT: s_branch .LBB26_5
; GFX10-W32-NEXT: .LBB26_3: ; %ELSE
; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1
; GFX10-W32-NEXT: ; implicit-def: $vgpr1
; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0
; GFX10-W32-NEXT: s_cbranch_execz .LBB26_2
; GFX10-W32-NEXT: .LBB26_4: ; %IF
; GFX10-W32-NEXT: v_mul_f32_e32 v0, 0x40400000, v1
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: s_branch .LBB26_5
; GFX10-W32-NEXT: .LBB26_5:
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0