[AMDGPU] Add scheduler pass to rematerialize trivial defs
Add a new pass in the pre-RA AMDGPU scheduler that checks whether sinking trivially rematerializable defs that have only one use outside of the defining block will increase occupancy. If we can determine that occupancy can be increased, then rematerialize only the minimum number of defs required to increase occupancy. Also reschedule all regions whose occupancy matched the previous min occupancy, using the new occupancy. This is based on the discussion in https://reviews.llvm.org/D117562. The logic for collecting the candidate defs and for determining whether sinking would be beneficial is mostly the same. The main differences are that we no longer limit it to immediate defs, and that the def and use do not have to be part of a loop. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D119475
This commit is contained in:
parent
b0f7dc2cf0
commit
28322c2514
|
@ -362,6 +362,9 @@ void GCNScheduleDAGMILive::schedule() {
|
|||
if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
|
||||
PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
|
||||
Pressure[RegionIdx] = PressureAfter;
|
||||
RegionsWithMinOcc[RegionIdx] =
|
||||
PressureAfter.getOccupancy(ST) == MinOccupancy;
|
||||
|
||||
LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
|
||||
return;
|
||||
}
|
||||
|
@ -378,6 +381,7 @@ void GCNScheduleDAGMILive::schedule() {
|
|||
// occupancy before was higher, or if the current schedule has register
|
||||
// pressure higher than the excess limits which could lead to more spilling.
|
||||
unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
|
||||
|
||||
// Allow memory bound functions to drop to 4 waves if not limited by an
|
||||
// attribute.
|
||||
if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
|
||||
|
@ -390,6 +394,7 @@ void GCNScheduleDAGMILive::schedule() {
|
|||
if (NewOccupancy < MinOccupancy) {
|
||||
MinOccupancy = NewOccupancy;
|
||||
MFI.limitOccupancy(MinOccupancy);
|
||||
RegionsWithMinOcc.reset();
|
||||
LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
|
||||
<< MinOccupancy << ".\n");
|
||||
}
|
||||
|
@ -416,6 +421,8 @@ void GCNScheduleDAGMILive::schedule() {
|
|||
PressureAfter.less(ST, PressureBefore) ||
|
||||
!RescheduleRegions[RegionIdx]) {
|
||||
Pressure[RegionIdx] = PressureAfter;
|
||||
RegionsWithMinOcc[RegionIdx] =
|
||||
PressureAfter.getOccupancy(ST) == MinOccupancy;
|
||||
if (!RegionsWithClusters[RegionIdx] &&
|
||||
(Stage + 1) == UnclusteredReschedule)
|
||||
RescheduleRegions[RegionIdx] = false;
|
||||
|
@ -425,6 +432,8 @@ void GCNScheduleDAGMILive::schedule() {
|
|||
}
|
||||
}
|
||||
|
||||
RegionsWithMinOcc[RegionIdx] =
|
||||
PressureBefore.getOccupancy(ST) == MinOccupancy;
|
||||
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
|
||||
RescheduleRegions[RegionIdx] = RegionsWithClusters[RegionIdx] ||
|
||||
(Stage + 1) != UnclusteredReschedule;
|
||||
|
@ -585,9 +594,11 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
|
|||
RescheduleRegions.resize(Regions.size());
|
||||
RegionsWithClusters.resize(Regions.size());
|
||||
RegionsWithHighRP.resize(Regions.size());
|
||||
RegionsWithMinOcc.resize(Regions.size());
|
||||
RescheduleRegions.set();
|
||||
RegionsWithClusters.reset();
|
||||
RegionsWithHighRP.reset();
|
||||
RegionsWithMinOcc.reset();
|
||||
|
||||
if (!Regions.empty())
|
||||
BBLiveInMap = getBBLiveInMap();
|
||||
|
@ -624,13 +635,42 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
|
|||
<< "Retrying function scheduling with lowest recorded occupancy "
|
||||
<< MinOccupancy << ".\n");
|
||||
}
|
||||
|
||||
if (Stage == PreRARematerialize) {
|
||||
if (RegionsWithMinOcc.count() != 1 || Regions.size() == 1)
|
||||
break;
|
||||
|
||||
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
||||
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
|
||||
// Check maximum occupancy
|
||||
if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
|
||||
MinOccupancy)
|
||||
break;
|
||||
|
||||
// FIXME: This pass will invalidate cached LiveIns, MBBLiveIns and
|
||||
// Pressure for regions inbetween the defs and region we sinked the def
|
||||
// to. Will need to be fixed if there is another pass after this pass.
|
||||
static_assert(LastStage == PreRARematerialize,
|
||||
"Passes after PreRARematerialize are not supported");
|
||||
|
||||
unsigned HighRPIdx = RegionsWithMinOcc.find_first();
|
||||
collectRematerializableInstructions(HighRPIdx);
|
||||
if (RematerializableInsts.empty() ||
|
||||
!sinkTriviallyRematInsts(ST, TII, HighRPIdx))
|
||||
break;
|
||||
|
||||
LLVM_DEBUG(
|
||||
dbgs() << "Retrying function scheduling with improved occupancy of "
|
||||
<< MinOccupancy << " from rematerializing\n");
|
||||
}
|
||||
}
|
||||
|
||||
if (Stage == UnclusteredReschedule)
|
||||
SavedMutations.swap(Mutations);
|
||||
|
||||
for (auto Region : Regions) {
|
||||
if ((Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx]) ||
|
||||
if (((Stage == UnclusteredReschedule || Stage == PreRARematerialize) &&
|
||||
!RescheduleRegions[RegionIdx]) ||
|
||||
(Stage == ClusteredLowOccupancyReschedule &&
|
||||
!RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])) {
|
||||
|
||||
|
@ -655,6 +695,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
|
|||
// Skip empty scheduling regions (0 or 1 schedulable instructions).
|
||||
if (begin() == end() || begin() == std::prev(end())) {
|
||||
exitRegion();
|
||||
++RegionIdx;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -677,3 +718,211 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
|
|||
SavedMutations.swap(Mutations);
|
||||
} while (Stage != LastStage);
|
||||
}
|
||||
|
||||
void GCNScheduleDAGMILive::collectRematerializableInstructions(
|
||||
unsigned HighRPIdx) {
|
||||
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
|
||||
const GCNRPTracker::LiveRegSet &HighRPLiveIns = LiveIns[HighRPIdx];
|
||||
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
|
||||
Register Reg = Register::index2VirtReg(I);
|
||||
if (!LIS->hasInterval(Reg))
|
||||
continue;
|
||||
|
||||
// TODO: Handle AGPR and SGPR rematerialization
|
||||
if (!SRI->isVGPRClass(MRI.getRegClass(Reg)) || !MRI.hasOneDef(Reg) ||
|
||||
!MRI.hasOneUse(Reg))
|
||||
continue;
|
||||
|
||||
// We are only collecting defs that are live-through or defined in another
|
||||
// block and used inside this region. This means that the register must be
|
||||
// in the live-in set for this region, else skip this def.
|
||||
if (HighRPLiveIns.find(Reg) == HighRPLiveIns.end())
|
||||
continue;
|
||||
|
||||
MachineInstr *Def = MRI.getOneDef(Reg)->getParent();
|
||||
if (!Def || !isTriviallyReMaterializable(*Def, AA))
|
||||
continue;
|
||||
|
||||
MachineInstr *UseI = &*MRI.use_instr_begin(Reg);
|
||||
if (Def->getParent() == UseI->getParent())
|
||||
continue;
|
||||
|
||||
RematerializableInsts.push_back(std::make_pair(Def, UseI));
|
||||
}
|
||||
}
|
||||
|
||||
bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
|
||||
const TargetInstrInfo *TII,
|
||||
unsigned HighRPIdx) {
|
||||
RescheduleRegions.reset();
|
||||
GCNRPTracker::LiveRegSet NewLiveIns;
|
||||
// We may not need to rematerialize all instructions. Keep a list of
|
||||
// instructions we are rematerializing at the end.
|
||||
SmallVector<std::pair<MachineInstr *, MachineInstr *>, 4>
|
||||
TrivialRematDefsToSink;
|
||||
|
||||
GCNRegPressure RegionPressure = Pressure[HighRPIdx];
|
||||
int VGPRUsage = RegionPressure.getVGPRNum(ST.hasGFX90AInsts());
|
||||
int SGPRUsage = RegionPressure.getSGPRNum();
|
||||
|
||||
// TODO: Handle occupancy drop due to AGPR and SGPR.
|
||||
// Check if cause of occupancy drop is due to VGPR usage.
|
||||
if (ST.getOccupancyWithNumVGPRs(VGPRUsage) > MinOccupancy ||
|
||||
ST.getOccupancyWithNumSGPRs(SGPRUsage) == MinOccupancy)
|
||||
return false;
|
||||
|
||||
NewLiveIns.copyFrom(LiveIns[HighRPIdx]);
|
||||
// First check if we have enough trivially rematerializable instructions to
|
||||
// improve occupancy. Optimistically assume all instructions we are able to
|
||||
// sink decreased RP.
|
||||
int TotalSinkableRegs = 0;
|
||||
for (auto &It : RematerializableInsts) {
|
||||
Register DefReg = It.first->getOperand(0).getReg();
|
||||
TotalSinkableRegs += SIRegisterInfo::getNumCoveredRegs(NewLiveIns[DefReg]);
|
||||
}
|
||||
int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
|
||||
unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
|
||||
// If in the most optimistic scenario, we cannot improve occupancy, then do
|
||||
// not attempt to sink any instructions.
|
||||
if (OptimisticOccupancy <= MinOccupancy)
|
||||
return false;
|
||||
|
||||
// Keep a list of newly rematerialized instructions so that we can easily
|
||||
// undo if occupancy is not improved.
|
||||
DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
|
||||
GCNDownwardRPTracker RPT(*LIS);
|
||||
auto *NonDbgMI = &*skipDebugInstructionsForward(Regions[HighRPIdx].first,
|
||||
Regions[HighRPIdx].second);
|
||||
unsigned ImproveOccupancy = 0;
|
||||
for (auto &It : RematerializableInsts) {
|
||||
MachineInstr *Def = It.first;
|
||||
MachineBasicBlock::iterator InsertPos =
|
||||
MachineBasicBlock::iterator(It.second);
|
||||
Register Reg = Def->getOperand(0).getReg();
|
||||
// Rematerialize MI to its use block. Since we are only rematerializing
|
||||
// instructions that do not have any virtual reg uses, we do not need to
|
||||
// call LiveRangeEdit::allUsesAvailableAt() and
|
||||
// LiveRangeEdit::canRematerializeAt().
|
||||
NewLiveIns[Reg] = LaneBitmask::getNone();
|
||||
TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
|
||||
Def->getOperand(0).getSubReg(), *Def, *TRI);
|
||||
MachineInstr *NewMI = &*(--InsertPos);
|
||||
LIS->InsertMachineInstrInMaps(*NewMI);
|
||||
LIS->removeInterval(Reg);
|
||||
LIS->createAndComputeVirtRegInterval(Reg);
|
||||
InsertedMIToOldDef[NewMI] = Def;
|
||||
|
||||
// FIXME: Need better way to update RP without re-iterating over region
|
||||
RPT.reset(*NonDbgMI, &NewLiveIns);
|
||||
RPT.advance(Regions[HighRPIdx].second);
|
||||
GCNRegPressure RPAfterSinking = RPT.moveMaxPressure();
|
||||
ImproveOccupancy = RPAfterSinking.getOccupancy(ST);
|
||||
if (ImproveOccupancy > MinOccupancy)
|
||||
break;
|
||||
}
|
||||
|
||||
if (ImproveOccupancy <= MinOccupancy) {
|
||||
// Occupancy is not improved. Undo sinking for the region
|
||||
for (auto &Entry : InsertedMIToOldDef) {
|
||||
MachineInstr *MI = Entry.first;
|
||||
MachineInstr *OldMI = Entry.second;
|
||||
Register Reg = MI->getOperand(0).getReg();
|
||||
LIS->RemoveMachineInstrFromMaps(*MI);
|
||||
MI->eraseFromParent();
|
||||
OldMI->clearRegisterDeads(Reg);
|
||||
LIS->removeInterval(Reg);
|
||||
LIS->createAndComputeVirtRegInterval(Reg);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Occupancy is improved.
|
||||
for (auto &Entry : InsertedMIToOldDef) {
|
||||
MachineInstr *MI = Entry.first;
|
||||
MachineInstr *OldMI = Entry.second;
|
||||
// Update region boundaries in scheduling region we sinked from since we
|
||||
// may sink an instruction that was at the beginning or end of its region
|
||||
updateRegionBoundaries(OldMI, /*NewMI =*/nullptr, /*Removing =*/true);
|
||||
|
||||
// Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
|
||||
BBLiveInMap.erase(OldMI);
|
||||
|
||||
// Remove OldMI and update LIS
|
||||
Register Reg = MI->getOperand(0).getReg();
|
||||
LIS->RemoveMachineInstrFromMaps(*OldMI);
|
||||
OldMI->eraseFromParent();
|
||||
LIS->removeInterval(Reg);
|
||||
LIS->createAndComputeVirtRegInterval(Reg);
|
||||
|
||||
// Update region boundaries in region we sinked to.
|
||||
MachineBasicBlock::iterator InsertPos =
|
||||
std::next(MachineBasicBlock::iterator(MI));
|
||||
updateRegionBoundaries(InsertPos, MI);
|
||||
}
|
||||
|
||||
// Update cached live-ins and register pressure after rematerializing
|
||||
LiveIns[HighRPIdx].copyFrom(NewLiveIns);
|
||||
MBBLiveIns.erase(Regions[HighRPIdx].first->getParent());
|
||||
|
||||
GCNDownwardRPTracker RPTracker(*LIS);
|
||||
RPTracker.advance(Regions[HighRPIdx].first, Regions[HighRPIdx].second,
|
||||
&LiveIns[HighRPIdx]);
|
||||
Pressure[HighRPIdx] = RPTracker.moveMaxPressure();
|
||||
|
||||
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
|
||||
MFI.increaseOccupancy(MF, ++MinOccupancy);
|
||||
RescheduleRegions[HighRPIdx] = true;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Copied from MachineLICM
|
||||
bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI,
|
||||
AAResults *AA) {
|
||||
if (!TII->isTriviallyReMaterializable(MI, AA))
|
||||
return false;
|
||||
|
||||
for (const MachineOperand &MO : MI.operands())
|
||||
if (MO.isReg() && MO.isUse() && MO.getReg().isVirtual())
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// When removing, we will have to check both beginning and ending of the region.
|
||||
// When inserting, we will only have to check if we are inserting NewMI in front
|
||||
// of a scheduling region and do not need to check the ending since we will only
|
||||
// ever be inserting before an already existing MI.
|
||||
void GCNScheduleDAGMILive::updateRegionBoundaries(
|
||||
MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
|
||||
unsigned I = 0, E = Regions.size();
|
||||
// Search for first region of the block where MI is located
|
||||
while (I != E && MI->getParent() != Regions[I].first->getParent())
|
||||
++I;
|
||||
|
||||
for (; I != E; ++I) {
|
||||
if (MI->getParent() != Regions[I].first->getParent())
|
||||
return;
|
||||
|
||||
if (Removing && MI == Regions[I].first && MI == Regions[I].second) {
|
||||
// MI is in a region with size 1, after removing, the region will be
|
||||
// size 0, set RegionBegin and RegionEnd to pass end of block iterator.
|
||||
Regions[I] =
|
||||
std::make_pair(MI->getParent()->end(), MI->getParent()->end());
|
||||
return;
|
||||
}
|
||||
if (MI == Regions[I].first) {
|
||||
if (Removing)
|
||||
Regions[I] = std::make_pair(std::next(MI), Regions[I].second);
|
||||
else
|
||||
// Inserted NewMI in front of region, set new RegionBegin to NewMI
|
||||
Regions[I] = std::make_pair(MachineBasicBlock::iterator(NewMI),
|
||||
Regions[I].second);
|
||||
return;
|
||||
}
|
||||
if (Removing && MI == Regions[I].second) {
|
||||
Regions[I] = std::make_pair(Regions[I].first, std::prev(MI));
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -77,7 +77,8 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
|
|||
InitialSchedule,
|
||||
UnclusteredReschedule,
|
||||
ClusteredLowOccupancyReschedule,
|
||||
LastStage = ClusteredLowOccupancyReschedule
|
||||
PreRARematerialize,
|
||||
LastStage = PreRARematerialize
|
||||
};
|
||||
|
||||
const GCNSubtarget &ST;
|
||||
|
@ -110,24 +111,47 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
|
|||
// Record regions with high register pressure.
|
||||
BitVector RegionsWithHighRP;
|
||||
|
||||
// Regions that has the same occupancy as the latest MinOccupancy
|
||||
BitVector RegionsWithMinOcc;
|
||||
|
||||
// Region live-in cache.
|
||||
SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
|
||||
|
||||
// Region pressure cache.
|
||||
SmallVector<GCNRegPressure, 32> Pressure;
|
||||
|
||||
// List of trivially rematerializable instructions we can remat to reduce RP.
|
||||
// First MI is the MI to remat and second MI is the position we should remat
|
||||
// before, usually the MI using the rematerializable instruction.
|
||||
SmallVector<std::pair<MachineInstr *, MachineInstr *>> RematerializableInsts;
|
||||
|
||||
// Temporary basic block live-in cache.
|
||||
DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;
|
||||
|
||||
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
|
||||
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;
|
||||
|
||||
// Collect all trivially rematerializable VGPR instructions with a single def
|
||||
// and single use outside the defining block into RematerializableInsts.
|
||||
void collectRematerializableInstructions(unsigned HighRPIdx);
|
||||
|
||||
bool isTriviallyReMaterializable(const MachineInstr &MI, AAResults *AA);
|
||||
|
||||
// TODO: Should also attempt to reduce RP of SGPRs and AGPRs
|
||||
// Attempt to reduce RP of VGPR by sinking trivially rematerializable
|
||||
// instructions. Returns true if we were able to sink instruction(s).
|
||||
bool sinkTriviallyRematInsts(const GCNSubtarget &ST,
|
||||
const TargetInstrInfo *TII, unsigned HighRPIdx);
|
||||
|
||||
// Return current region pressure.
|
||||
GCNRegPressure getRealRegPressure() const;
|
||||
|
||||
// Compute and cache live-ins and pressure for all regions in block.
|
||||
void computeBlockPressure(const MachineBasicBlock *MBB);
|
||||
|
||||
// Update region boundaries when removing MI or inserting NewMI before MI.
|
||||
void updateRegionBoundaries(MachineBasicBlock::iterator MI,
|
||||
MachineInstr *NewMI, bool Removing = false);
|
||||
|
||||
public:
|
||||
GCNScheduleDAGMILive(MachineSchedContext *C,
|
||||
|
|
|
@ -533,21 +533,19 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
|
|||
; GFX908-NEXT: s_lshl_b64 s[10:11], s[2:3], 5
|
||||
; GFX908-NEXT: s_or_b32 s10, s10, 28
|
||||
; GFX908-NEXT: v_rcp_iflag_f32_e32 v1, v1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v13, s10
|
||||
; GFX908-NEXT: v_mov_b32_e32 v11, s10
|
||||
; GFX908-NEXT: s_lshr_b32 s12, s7, 16
|
||||
; GFX908-NEXT: v_mov_b32_e32 v32, s11
|
||||
; GFX908-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
|
||||
; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v1
|
||||
; GFX908-NEXT: v_cvt_f32_f16_e32 v28, s7
|
||||
; GFX908-NEXT: v_cvt_f32_f16_e32 v29, s12
|
||||
; GFX908-NEXT: v_accvgpr_write_b32 a0, v13
|
||||
; GFX908-NEXT: v_cvt_f32_f16_e32 v26, s7
|
||||
; GFX908-NEXT: v_cvt_f32_f16_e32 v27, s12
|
||||
; GFX908-NEXT: v_accvgpr_write_b32 a0, v11
|
||||
; GFX908-NEXT: v_mul_lo_u32 v1, s4, v2
|
||||
; GFX908-NEXT: v_accvgpr_write_b32 a1, v32
|
||||
; GFX908-NEXT: v_mov_b32_e32 v11, s3
|
||||
; GFX908-NEXT: s_lshl_b64 s[4:5], s[8:9], 5
|
||||
; GFX908-NEXT: v_mul_hi_u32 v3, v2, v1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v10, s2
|
||||
; GFX908-NEXT: v_add_u32_e32 v2, v2, v3
|
||||
; GFX908-NEXT: v_mul_hi_u32 v4, s0, v2
|
||||
; GFX908-NEXT: v_mul_lo_u32 v5, v4, s1
|
||||
|
@ -560,24 +558,26 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
|
|||
; GFX908-NEXT: v_add_u32_e32 v7, 1, v4
|
||||
; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s1, v5
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: v_and_b32_e32 v30, 0xffff, v0
|
||||
; GFX908-NEXT: v_and_b32_e32 v28, 0xffff, v0
|
||||
; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc
|
||||
; GFX908-NEXT: v_mul_lo_u32 v8, s9, v30
|
||||
; GFX908-NEXT: v_mul_hi_u32 v9, s8, v30
|
||||
; GFX908-NEXT: v_mul_lo_u32 v8, s9, v28
|
||||
; GFX908-NEXT: v_mul_hi_u32 v9, s8, v28
|
||||
; GFX908-NEXT: v_lshlrev_b64 v[2:3], 5, v[0:1]
|
||||
; GFX908-NEXT: v_mul_lo_u32 v6, s8, v30
|
||||
; GFX908-NEXT: v_mul_lo_u32 v6, s8, v28
|
||||
; GFX908-NEXT: v_add_u32_e32 v7, v9, v8
|
||||
; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
|
||||
; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
|
||||
; GFX908-NEXT: v_lshlrev_b64 v[6:7], 5, v[6:7]
|
||||
; GFX908-NEXT: v_mov_b32_e32 v9, s3
|
||||
; GFX908-NEXT: v_mov_b32_e32 v8, s2
|
||||
; GFX908-NEXT: s_branch .LBB3_2
|
||||
; GFX908-NEXT: .LBB3_1: ; %bb12
|
||||
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
|
||||
; GFX908-NEXT: v_add_co_u32_e32 v10, vcc, v10, v0
|
||||
; GFX908-NEXT: v_add_co_u32_e32 v8, vcc, v8, v0
|
||||
; GFX908-NEXT: s_nop 0
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v3, a1
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v5, a3
|
||||
; GFX908-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
|
||||
; GFX908-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v4, a2
|
||||
; GFX908-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
|
||||
|
@ -591,79 +591,79 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
|
|||
; GFX908-NEXT: s_cbranch_scc0 .LBB3_1
|
||||
; GFX908-NEXT: ; %bb.3: ; %bb14
|
||||
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX908-NEXT: global_load_dwordx2 v[12:13], v[2:3], off
|
||||
; GFX908-NEXT: v_mov_b32_e32 v10, 0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v11, 0
|
||||
; GFX908-NEXT: global_load_dwordx2 v[10:11], v[10:11], off
|
||||
; GFX908-NEXT: s_mov_b32 s7, s6
|
||||
; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[10:11]
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v15, a1
|
||||
; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[8:9]
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v13, a1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v15, s7
|
||||
; GFX908-NEXT: v_mov_b32_e32 v17, s7
|
||||
; GFX908-NEXT: v_mov_b32_e32 v19, s7
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v14, a0
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v12, a0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v14, s6
|
||||
; GFX908-NEXT: v_mov_b32_e32 v16, s6
|
||||
; GFX908-NEXT: v_mov_b32_e32 v18, s6
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: v_add_co_u32_e32 v22, vcc, 1, v12
|
||||
; GFX908-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v13, vcc
|
||||
; GFX908-NEXT: v_mul_lo_u32 v23, s4, v20
|
||||
; GFX908-NEXT: v_mul_hi_u32 v24, s4, v22
|
||||
; GFX908-NEXT: v_mul_lo_u32 v25, s5, v22
|
||||
; GFX908-NEXT: v_mul_lo_u32 v31, s4, v22
|
||||
; GFX908-NEXT: v_add_co_u32_e32 v20, vcc, 1, v10
|
||||
; GFX908-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v11, vcc
|
||||
; GFX908-NEXT: v_mul_lo_u32 v21, s4, v18
|
||||
; GFX908-NEXT: v_mul_hi_u32 v22, s4, v20
|
||||
; GFX908-NEXT: v_mul_lo_u32 v23, s5, v20
|
||||
; GFX908-NEXT: v_mul_lo_u32 v29, s4, v20
|
||||
; GFX908-NEXT: v_mov_b32_e32 v19, s7
|
||||
; GFX908-NEXT: v_add_u32_e32 v20, v22, v21
|
||||
; GFX908-NEXT: v_add_u32_e32 v30, v20, v23
|
||||
; GFX908-NEXT: v_mov_b32_e32 v21, s7
|
||||
; GFX908-NEXT: v_add_u32_e32 v22, v24, v23
|
||||
; GFX908-NEXT: v_add_u32_e32 v33, v22, v25
|
||||
; GFX908-NEXT: v_mov_b32_e32 v23, s7
|
||||
; GFX908-NEXT: v_mov_b32_e32 v18, s6
|
||||
; GFX908-NEXT: v_mov_b32_e32 v20, s6
|
||||
; GFX908-NEXT: v_mov_b32_e32 v22, s6
|
||||
; GFX908-NEXT: s_branch .LBB3_5
|
||||
; GFX908-NEXT: .LBB3_4: ; %bb58
|
||||
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
|
||||
; GFX908-NEXT: v_add_co_u32_e32 v12, vcc, v12, v30
|
||||
; GFX908-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
|
||||
; GFX908-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[12:13]
|
||||
; GFX908-NEXT: v_add_co_u32_e64 v14, s[2:3], v14, v6
|
||||
; GFX908-NEXT: v_addc_co_u32_e64 v15, s[2:3], v15, v7, s[2:3]
|
||||
; GFX908-NEXT: v_add_co_u32_e32 v10, vcc, v10, v28
|
||||
; GFX908-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
|
||||
; GFX908-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11]
|
||||
; GFX908-NEXT: v_add_co_u32_e64 v12, s[2:3], v12, v6
|
||||
; GFX908-NEXT: v_addc_co_u32_e64 v13, s[2:3], v13, v7, s[2:3]
|
||||
; GFX908-NEXT: s_cbranch_vccz .LBB3_1
|
||||
; GFX908-NEXT: .LBB3_5: ; %bb16
|
||||
; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1
|
||||
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
; GFX908-NEXT: v_add_co_u32_e32 v24, vcc, v14, v31
|
||||
; GFX908-NEXT: v_addc_co_u32_e32 v25, vcc, v15, v33, vcc
|
||||
; GFX908-NEXT: global_load_dword v35, v[24:25], off offset:-12 glc
|
||||
; GFX908-NEXT: v_add_co_u32_e32 v22, vcc, v12, v29
|
||||
; GFX908-NEXT: v_addc_co_u32_e32 v23, vcc, v13, v30, vcc
|
||||
; GFX908-NEXT: global_load_dword v33, v[22:23], off offset:-12 glc
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: global_load_dword v34, v[24:25], off offset:-8 glc
|
||||
; GFX908-NEXT: global_load_dword v31, v[22:23], off offset:-8 glc
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: global_load_dword v26, v[24:25], off offset:-4 glc
|
||||
; GFX908-NEXT: global_load_dword v24, v[22:23], off offset:-4 glc
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: global_load_dword v24, v[24:25], off glc
|
||||
; GFX908-NEXT: global_load_dword v22, v[22:23], off glc
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: ds_read_b64 v[24:25], v1
|
||||
; GFX908-NEXT: ds_read_b64 v[26:27], v0
|
||||
; GFX908-NEXT: ds_read_b64 v[22:23], v1
|
||||
; GFX908-NEXT: ds_read_b64 v[24:25], v0
|
||||
; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1]
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: s_cbranch_vccnz .LBB3_4
|
||||
; GFX908-NEXT: ; %bb.6: ; %bb51
|
||||
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
|
||||
; GFX908-NEXT: v_cvt_f32_f16_sdwa v8, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX908-NEXT: v_cvt_f32_f16_e32 v9, v35
|
||||
; GFX908-NEXT: v_cvt_f32_f16_sdwa v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX908-NEXT: v_cvt_f32_f16_e32 v34, v34
|
||||
; GFX908-NEXT: v_add_f32_e32 v4, v28, v24
|
||||
; GFX908-NEXT: v_add_f32_e32 v5, v29, v25
|
||||
; GFX908-NEXT: v_add_f32_e32 v2, 0, v24
|
||||
; GFX908-NEXT: v_add_f32_e32 v3, 0, v25
|
||||
; GFX908-NEXT: v_add_f32_e32 v8, v8, v27
|
||||
; GFX908-NEXT: v_add_f32_e32 v9, v9, v26
|
||||
; GFX908-NEXT: v_add_f32_e32 v25, v35, v25
|
||||
; GFX908-NEXT: v_add_f32_e32 v24, v34, v24
|
||||
; GFX908-NEXT: v_add_f32_e32 v17, v17, v5
|
||||
; GFX908-NEXT: v_add_f32_e32 v16, v16, v4
|
||||
; GFX908-NEXT: v_add_f32_e32 v19, v19, v3
|
||||
; GFX908-NEXT: v_add_f32_e32 v18, v18, v2
|
||||
; GFX908-NEXT: v_add_f32_e32 v20, v20, v9
|
||||
; GFX908-NEXT: v_add_f32_e32 v21, v21, v8
|
||||
; GFX908-NEXT: v_add_f32_e32 v22, v22, v24
|
||||
; GFX908-NEXT: v_add_f32_e32 v23, v23, v25
|
||||
; GFX908-NEXT: v_cvt_f32_f16_sdwa v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX908-NEXT: v_cvt_f32_f16_e32 v33, v33
|
||||
; GFX908-NEXT: v_cvt_f32_f16_sdwa v35, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX908-NEXT: v_cvt_f32_f16_e32 v31, v31
|
||||
; GFX908-NEXT: v_add_f32_e32 v4, v26, v22
|
||||
; GFX908-NEXT: v_add_f32_e32 v5, v27, v23
|
||||
; GFX908-NEXT: v_add_f32_e32 v2, 0, v22
|
||||
; GFX908-NEXT: v_add_f32_e32 v3, 0, v23
|
||||
; GFX908-NEXT: v_add_f32_e32 v25, v34, v25
|
||||
; GFX908-NEXT: v_add_f32_e32 v24, v33, v24
|
||||
; GFX908-NEXT: v_add_f32_e32 v23, v35, v23
|
||||
; GFX908-NEXT: v_add_f32_e32 v22, v31, v22
|
||||
; GFX908-NEXT: v_add_f32_e32 v15, v15, v5
|
||||
; GFX908-NEXT: v_add_f32_e32 v14, v14, v4
|
||||
; GFX908-NEXT: v_add_f32_e32 v17, v17, v3
|
||||
; GFX908-NEXT: v_add_f32_e32 v16, v16, v2
|
||||
; GFX908-NEXT: v_add_f32_e32 v18, v18, v24
|
||||
; GFX908-NEXT: v_add_f32_e32 v19, v19, v25
|
||||
; GFX908-NEXT: v_add_f32_e32 v20, v20, v22
|
||||
; GFX908-NEXT: v_add_f32_e32 v21, v21, v23
|
||||
; GFX908-NEXT: s_branch .LBB3_4
|
||||
;
|
||||
; GFX90A-LABEL: introduced_copy_to_sgpr:
|
||||
|
|
|
@ -49,29 +49,29 @@ body: |
|
|||
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
|
||||
; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: undef %11.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.1:
|
||||
; CHECK-NEXT: successors: %bb.2(0x80000000)
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: undef %17.sub0:vreg_64, %18:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF4]].sub0, [[DEF6]].sub0, 0, implicit $exec
|
||||
; CHECK-NEXT: dead undef %17.sub1:vreg_64, dead %19:sreg_64_xexec = V_ADDC_U32_e64 [[DEF4]].sub1, [[DEF6]].sub1, %18, 0, implicit $exec
|
||||
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF1]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
|
||||
; CHECK-NEXT: dead %12:vreg_64 = COPY [[DEF]]
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF3]]
|
||||
; CHECK-NEXT: dead %14:vgpr_32 = COPY [[DEF2]]
|
||||
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF5]].sub1
|
||||
; CHECK-NEXT: undef %17.sub0:vreg_64, %18:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF3]].sub0, [[DEF5]].sub0, 0, implicit $exec
|
||||
; CHECK-NEXT: dead undef %17.sub1:vreg_64, dead %19:sreg_64_xexec = V_ADDC_U32_e64 [[DEF3]].sub1, [[DEF5]].sub1, %18, 0, implicit $exec
|
||||
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
|
||||
; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: dead %12:vreg_64 = COPY [[DEF8]]
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
|
||||
; CHECK-NEXT: dead %14:vgpr_32 = COPY [[DEF1]]
|
||||
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub1
|
||||
; CHECK-NEXT: dead %16:vgpr_32 = COPY %11.sub0
|
||||
; CHECK-NEXT: dead %20:sreg_64 = V_CMP_GT_I32_e64 4, [[DEF7]], implicit $exec
|
||||
; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF8]], 288, 0, implicit $exec :: (store (s64), addrspace 1)
|
||||
; CHECK-NEXT: dead %20:sreg_64 = V_CMP_GT_I32_e64 4, [[DEF6]], implicit $exec
|
||||
; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF7]], 288, 0, implicit $exec :: (store (s64), addrspace 1)
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.2:
|
||||
; CHECK-NEXT: successors: %bb.3(0x80000000)
|
||||
|
@ -81,7 +81,7 @@ body: |
|
|||
; CHECK-NEXT: bb.3:
|
||||
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: undef [[DEF5]].sub1:vreg_64 = COPY [[COPY3]]
|
||||
; CHECK-NEXT: undef [[DEF4]].sub1:vreg_64 = COPY [[COPY3]]
|
||||
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.4:
|
||||
|
@ -89,7 +89,7 @@ body: |
|
|||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: dead %21:sreg_64 = COPY $exec
|
||||
; CHECK-NEXT: dead %22:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY1]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
|
||||
; CHECK-NEXT: DBG_VALUE %22, $noreg
|
||||
; CHECK-NEXT: DBG_VALUE %22, $noreg, <0x{{[0-9a-f]+}}>, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !DILocation(line: 0, scope: <0x{{[0-9a-f]+}}>)
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.5:
|
||||
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
|
||||
|
|
|
@ -24,7 +24,7 @@ body: |
|
|||
; CHECK: bb.0:
|
||||
; CHECK-NEXT: successors: %bb.1(0x80000000)
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
|
@ -32,10 +32,9 @@ body: |
|
|||
; CHECK-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: %9:vgpr_32 = nofpexcept V_MUL_F32_e32 1082130432, [[DEF]], implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: %9:vgpr_32 = nofpexcept V_MUL_F32_e32 1082130432, [[DEF1]], implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.1:
|
||||
; CHECK-NEXT: successors: %bb.2(0x80000000)
|
||||
|
@ -51,33 +50,34 @@ body: |
|
|||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
; CHECK-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]]
|
||||
; CHECK-NEXT: %16:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: %17:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: %16:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: %17:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: %18:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec
|
||||
; CHECK-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: %21:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: %22:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: dead %23:vgpr_32 = nofpexcept V_MUL_F32_e32 %22, [[DEF13]], implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: %22:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: dead %23:vgpr_32 = nofpexcept V_MUL_F32_e32 %22, [[DEF12]], implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: dead [[V_MOV_B32_e32_1]]:vgpr_32 = nofpexcept V_MAC_F32_e32 %21, [[COPY]], [[V_MOV_B32_e32_1]], implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: [[DEF14:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF13:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: $sgpr4 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: $vgpr0 = COPY [[DEF11]]
|
||||
; CHECK-NEXT: $vgpr0 = COPY [[DEF10]]
|
||||
; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]]
|
||||
; CHECK-NEXT: $vgpr1 = COPY [[DEF7]]
|
||||
; CHECK-NEXT: $vgpr1 = COPY [[DEF6]]
|
||||
; CHECK-NEXT: $vgpr0 = COPY %16
|
||||
; CHECK-NEXT: $vgpr1 = COPY %17
|
||||
; CHECK-NEXT: $vgpr2 = COPY %18
|
||||
; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL [[DEF14]], @foo, csr_amdgpu_highregs, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0
|
||||
; CHECK-NEXT: %25:vgpr_32 = nofpexcept V_ADD_F32_e32 %9, [[DEF8]], implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: %25:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF12]], [[DEF9]], %25, implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: dead %26:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: dead %27:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: dead %28:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, [[DEF6]], 0, [[DEF3]], 0, 0, implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF]], [[DEF10]], 0, 0, implicit $exec
|
||||
; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL [[DEF13]], @foo, csr_amdgpu_highregs, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0
|
||||
; CHECK-NEXT: %25:vgpr_32 = nofpexcept V_ADD_F32_e32 %9, [[DEF7]], implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: %25:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF11]], [[DEF8]], %25, implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: dead %26:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, [[DEF3]], 0, [[DEF]], 0, 0, implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: dead %27:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: dead %28:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec
|
||||
; CHECK-NEXT: [[DEF14:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF14]], [[DEF9]], 0, 0, implicit $exec
|
||||
; CHECK-NEXT: S_ENDPGM 0
|
||||
bb.0:
|
||||
successors: %bb.1
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -24,7 +24,6 @@ body: |
|
|||
; CHECK-NEXT: undef %0.sub3:vreg_512 = V_MOV_B32_e32 0, implicit $exec
|
||||
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 0, [[V_MOV_B32_e32_]], implicit $exec
|
||||
; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_512 = COPY %0
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.1:
|
||||
|
@ -39,10 +38,11 @@ body: |
|
|||
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_LO16 */, def dead [[COPY1]], 851978 /* regdef:VGPR_LO16 */, def dead [[COPY]].sub1, 2147483657 /* reguse tiedto:$0 */, [[COPY1]], 2147549193 /* reguse tiedto:$1 */, [[COPY]].sub1
|
||||
; CHECK-NEXT: %11.sub0:vreg_512 = COPY [[COPY]].sub0
|
||||
; CHECK-NEXT: %11.sub3:vreg_512 = COPY [[COPY]].sub3
|
||||
; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
; CHECK-NEXT: dead %10:vgpr_32 = V_ADD_CO_U32_e32 4, [[V_MOV_B32_e32_1]], implicit-def dead $vcc, implicit $exec
|
||||
; CHECK-NEXT: %11.sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_]]
|
||||
; CHECK-NEXT: %11.sub5:vreg_512 = COPY undef [[V_MOV_B32_e32_]]
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_512 = COPY %11
|
||||
; CHECK-NEXT: dead %10:vgpr_32 = V_ADD_CO_U32_e32 4, [[V_MOV_B32_e32_1]], implicit-def dead $vcc, implicit $exec
|
||||
; CHECK-NEXT: S_BRANCH %bb.1
|
||||
bb.0:
|
||||
liveins: $sgpr6_sgpr7
|
||||
|
|
Loading…
Reference in New Issue