[AMDGPU] Fix too many constants with flat scratch

Prevent SIFoldOperands from creating SALU instructions with a constant
and a frame index. Previously, only one operand was checked to be a
frame index, leading to too many constants when flat scratch is enabled
and stack offsets are large.

Differential Revision: https://reviews.llvm.org/D108368
This commit is contained in:
Sebastian Neubauer 2021-08-19 13:55:03 +02:00
parent fa4132dc88
commit f3fe44fa05
2 changed files with 122 additions and 1 deletions

View File

@ -452,7 +452,7 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
const SIRegisterInfo &SRI = TII->getRegisterInfo();
// Fine if the operand can be encoded as an inline constant
if (OpToFold->isImm()) {
if (TII->isLiteralConstantLike(*OpToFold, OpInfo)) {
if (!SRI.opCanUseInlineConstant(OpInfo.OperandType) ||
!TII->isInlineConstant(*OpToFold, OpInfo)) {
// Otherwise check for another constant

View File

@ -2719,5 +2719,126 @@ bb:
ret void
}
; Regression test for D108368. Exercises scratch addressing when the stack
; offset is large. Two [128 x <4 x i32>] allocas (128 * 16 = 2048 bytes each)
; are created, and element 60 of the second one (60 * 16 = 0x3c0 bytes into
; that alloca) is written and read back with volatile accesses.
; In the expected output below the second alloca sits at 0x810, so the
; accessed address is 0x810 + 0x3c0 = 3024. The GFX9 variants encode 3024 as
; the instruction offset, while the GFX10 variants build the address in an
; SGPR with s_movk_i32 followed by s_addk_i32.
define amdgpu_ps void @large_offset() {
; GFX9-LABEL: large_offset:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s2
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: v_mov_b32_e32 v3, v0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 16
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use v0
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: v_mov_b32_e32 v0, 0x810
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use v0
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: large_offset:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_movk_i32 s0, 0x810
; GFX10-NEXT: s_addk_i32 s0, 0x3c0
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dwordx4 v[0:3], off, s0 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 16
; GFX10-NEXT: v_mov_b32_e32 v1, 0x810
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use v0
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use v1
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: large_offset:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT: s_mov_b32 s2, s0
; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, v0
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, v0
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, v0
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 16
; GFX9-PAL-NEXT: ;;#ASMSTART
; GFX9-PAL-NEXT: ; use v0
; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x810
; GFX9-PAL-NEXT: ;;#ASMSTART
; GFX9-PAL-NEXT: ; use v0
; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_endpgm
;
; GFX10-PAL-LABEL: large_offset:
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT: s_mov_b32 s2, s0
; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT: s_add_u32 s2, s2, s0
; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-PAL-NEXT: s_movk_i32 s0, 0x810
; GFX10-PAL-NEXT: s_addk_i32 s0, 0x3c0
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, v0
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, v0
; GFX10-PAL-NEXT: v_mov_b32_e32 v3, v0
; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, s0 glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 16
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x810
; GFX10-PAL-NEXT: ;;#ASMSTART
; GFX10-PAL-NEXT: ; use v0
; GFX10-PAL-NEXT: ;;#ASMEND
; GFX10-PAL-NEXT: ;;#ASMSTART
; GFX10-PAL-NEXT: ; use v1
; GFX10-PAL-NEXT: ;;#ASMEND
; GFX10-PAL-NEXT: s_endpgm
bb:
; Two private (addrspace 5) stack objects of 128 * 16 = 2048 bytes each.
%alloca = alloca [128 x <4 x i32>], align 16, addrspace(5)
%alloca2 = alloca [128 x <4 x i32>], align 16, addrspace(5)
; Address of element 60 of the second object, 60 * 16 = 960 (0x3c0) bytes in.
%gep = getelementptr inbounds [128 x <4 x i32>], [128 x <4 x i32>] addrspace(5)* %alloca2, i32 0, i32 60
; Volatile store/load pair through %gep; these are the scratch_store_dwordx4
; and scratch_load_dwordx4 instructions in the expected output above.
store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(5)* %gep, align 16
%load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %gep, align 16
; Hand the base address of each alloca to inline asm via an "s" constraint,
; so both addresses (16 and 0x810 in the expected output) get materialized.
call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca) #0
call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca2) #0
ret void
}
declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()