[AMDGPU] Add GFX11 feature for subtargets with more VGPRs

The full complement of physical VGPRs for GFX11 is 50% more than GFX10.
Some subtargets have this, others stay the same as GFX10. This affects
occupancy calculations.

Differential Revision: https://reviews.llvm.org/D134522
This commit is contained in:
Jay Foad 2022-09-23 09:02:49 +01:00
parent d2e434c378
commit ddfa0f62d8
8 changed files with 239 additions and 152 deletions

View File

@ -695,6 +695,12 @@ def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard",
"Does not need SW waitstates"
>;
// Subtarget feature "gfx11-full-vgprs": sets the HasGFX11FullVGPRs subtarget
// field to "true". It marks GFX11 subtargets that have the full VGPR
// complement, i.e. 50% more physical VGPRs and a 50% larger allocation granule
// than GFX10.
def FeatureGFX11FullVGPRs : SubtargetFeature<"gfx11-full-vgprs",
"HasGFX11FullVGPRs",
"true",
"GFX11 with 50% more physical VGPRs and 50% larger allocation granule than GFX10"
>;
class SubtargetFeatureNSAMaxSize <int Value> : SubtargetFeature <
"nsa-max-size-"#Value,
"NSAMaxSize",
@ -1297,11 +1303,12 @@ def FeatureISAVersion11_Common : FeatureSet<
def FeatureISAVersion11_0_0 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureUserSGPRInit16Bug])>;
[FeatureGFX11FullVGPRs,
FeatureUserSGPRInit16Bug])>;
def FeatureISAVersion11_0_1 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[])>;
[FeatureGFX11FullVGPRs])>;
def FeatureISAVersion11_0_2 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,

View File

@ -192,6 +192,7 @@ protected:
bool HasFlatSegmentOffsetBug = false;
bool HasImageStoreD16Bug = false;
bool HasImageGather4D16Bug = false;
bool HasGFX11FullVGPRs = false;
bool HasVOPDInsts = false;
// Dummy feature to use for assembler in tablegen.
@ -1071,6 +1072,8 @@ public:
/// target.
bool hasNullExportTarget() const { return !GFX11Insts; }
/// \returns the value of the HasGFX11FullVGPRs subtarget flag (false unless
/// the "gfx11-full-vgprs" feature enables it).
bool hasGFX11FullVGPRs() const { return HasGFX11FullVGPRs; }
/// \returns the value of the HasVOPDInsts subtarget flag.
bool hasVOPDInsts() const { return HasVOPDInsts; }
/// \returns true when this subtarget's generation is GFX11; the check is on
/// the generation as a whole, not on a dedicated feature bit.
bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }

View File

@ -844,6 +844,9 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
*EnableWavefrontSize32 :
STI->getFeatureBits().test(FeatureWavefrontSize32);
if (STI->getFeatureBits().test(FeatureGFX11FullVGPRs))
return IsWave32 ? 24 : 12;
if (hasGFX10_3Insts(*STI))
return IsWave32 ? 16 : 8;
@ -867,7 +870,10 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
return 512;
if (!isGFX10Plus(*STI))
return 256;
return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1024 : 512;
bool IsWave32 = STI->getFeatureBits().test(FeatureWavefrontSize32);
if (STI->getFeatureBits().test(FeatureGFX11FullVGPRs))
return IsWave32 ? 1536 : 768;
return IsWave32 ? 1024 : 512;
}
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {

View File

@ -397,54 +397,57 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_load_b128 v[16:19], v[0:1], off
; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2
; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off offset:32
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_dual_cndmask_b32 v17, v13, v15 :: v_dual_cndmask_b32 v16, v12, v14
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2
; GFX11-NEXT: v_add_nc_u32_e32 v3, 1, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v3
; GFX11-NEXT: v_cndmask_b32_e64 v18, v12, v14, s0
; GFX11-NEXT: v_cndmask_b32_e64 v19, v13, v15, s0
; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:48
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: v_cndmask_b32_e32 v3, v17, v19, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v18 :: v_dual_add_nc_u32 v1, 1, v0
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_dual_cndmask_b32 v1, v17, v5 :: v_dual_cndmask_b32 v0, v16, v4
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v3
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e64 v4, v18, v4, s0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v19, v5, s0
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v3
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_cndmask_b32 v3, v3, v5
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v18, s0
; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v19, s0
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_cndmask_b32 v3, v3, v7
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e64 v4, v16, v4, s0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v17, v5, s0
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v1
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v8 :: v_dual_cndmask_b32 v3, v3, v9
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v6, s0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v7, s0
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v3
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_cndmask_b32 v1, v1, v9
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v1
; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v3
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v12 :: v_dual_cndmask_b32 v3, v3, v13
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v3
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v12 :: v_dual_cndmask_b32 v1, v1, v13
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v1
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v14, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v12, s0
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s0
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v3
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v1
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v15, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, v14, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v15, s0

View File

@ -104,7 +104,10 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x8
; GFX11-NEXT: s_clause 0xa
; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] offset:128
; GFX11-NEXT: global_load_b128 v[4:7], v64, s[0:1] offset:144
; GFX11-NEXT: global_load_b128 v[8:11], v64, s[0:1] offset:160
; GFX11-NEXT: global_load_b128 v[32:35], v64, s[0:1]
; GFX11-NEXT: global_load_b128 v[36:39], v64, s[0:1] offset:16
; GFX11-NEXT: global_load_b128 v[40:43], v64, s[0:1] offset:32
@ -113,42 +116,45 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
; GFX11-NEXT: global_load_b128 v[52:55], v64, s[0:1] offset:80
; GFX11-NEXT: global_load_b128 v[56:59], v64, s[0:1] offset:96
; GFX11-NEXT: global_load_b128 v[60:63], v64, s[0:1] offset:112
; GFX11-NEXT: global_load_b128 v[4:7], v64, s[0:1] offset:144
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(9)
; GFX11-NEXT: v_mov_b32_e32 v5, 0x3e7
; GFX11-NEXT: s_clause 0x6
; GFX11-NEXT: global_load_b128 v[28:31], v64, s[0:1] offset:128
; GFX11-NEXT: global_load_b128 v[24:27], v64, s[0:1] offset:160
; GFX11-NEXT: global_load_b128 v[20:23], v64, s[0:1] offset:176
; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] offset:192
; GFX11-NEXT: global_load_b128 v[16:19], v64, s[0:1] offset:208
; GFX11-NEXT: global_load_b128 v[8:11], v64, s[0:1] offset:224
; GFX11-NEXT: global_load_b128 v[12:15], v64, s[0:1] offset:240
; GFX11-NEXT: s_waitcnt vmcnt(6)
; GFX11-NEXT: s_clause 0x4
; GFX11-NEXT: global_load_b128 v[12:15], v64, s[0:1] offset:176
; GFX11-NEXT: global_load_b128 v[28:31], v64, s[0:1] offset:192
; GFX11-NEXT: global_load_b128 v[24:27], v64, s[0:1] offset:208
; GFX11-NEXT: global_load_b128 v[20:23], v64, s[0:1] offset:224
; GFX11-NEXT: global_load_b128 v[16:19], v64, s[0:1] offset:240
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v64, v[28:31], s[2:3] offset:128
; GFX11-NEXT: global_store_b128 v64, v[0:3], s[2:3] offset:128
; GFX11-NEXT: global_store_b128 v64, v[4:7], s[2:3] offset:144
; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: global_store_b128 v64, v[24:27], s[2:3] offset:160
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: global_store_b128 v64, v[20:23], s[2:3] offset:176
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: s_clause 0x8
; GFX11-NEXT: global_store_b128 v64, v[0:3], s[2:3] offset:192
; GFX11-NEXT: s_waitcnt vmcnt(13)
; GFX11-NEXT: global_store_b128 v64, v[8:11], s[2:3] offset:160
; GFX11-NEXT: s_waitcnt vmcnt(12)
; GFX11-NEXT: global_store_b128 v64, v[32:35], s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(11)
; GFX11-NEXT: global_store_b128 v64, v[36:39], s[2:3] offset:16
; GFX11-NEXT: s_waitcnt vmcnt(10)
; GFX11-NEXT: global_store_b128 v64, v[40:43], s[2:3] offset:32
; GFX11-NEXT: s_waitcnt vmcnt(9)
; GFX11-NEXT: global_store_b128 v64, v[44:47], s[2:3] offset:48
; GFX11-NEXT: s_waitcnt vmcnt(8)
; GFX11-NEXT: global_store_b128 v64, v[48:51], s[2:3] offset:64
; GFX11-NEXT: s_waitcnt vmcnt(7)
; GFX11-NEXT: global_store_b128 v64, v[52:55], s[2:3] offset:80
; GFX11-NEXT: s_waitcnt vmcnt(6)
; GFX11-NEXT: global_store_b128 v64, v[56:59], s[2:3] offset:96
; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: global_store_b128 v64, v[60:63], s[2:3] offset:112
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: global_store_b128 v64, v[12:15], s[2:3] offset:176
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: global_store_b128 v64, v[28:31], s[2:3] offset:192
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:208
; GFX11-NEXT: global_store_b128 v64, v[24:27], s[2:3] offset:208
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: global_store_b128 v64, v[8:11], s[2:3] offset:224
; GFX11-NEXT: global_store_b128 v64, v[20:23], s[2:3] offset:224
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b128 v64, v[12:15], s[2:3] offset:240
; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:240
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()

View File

@ -942,6 +942,7 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_mov_b32 s14, 0
; GFX11-NEXT: s_mov_b32 s15, 0x40200000
; GFX11-NEXT: s_mov_b64 s[0:1], 1.0
; GFX11-NEXT: s_mov_b32 s13, 0x401c0000
; GFX11-NEXT: s_mov_b32 s12, s14
; GFX11-NEXT: s_mov_b32 s11, 0x40180000
@ -952,38 +953,34 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; GFX11-NEXT: s_mov_b32 s5, 0x40080000
; GFX11-NEXT: s_mov_b32 s4, s14
; GFX11-NEXT: s_mov_b64 s[2:3], 2.0
; GFX11-NEXT: s_mov_b64 s[0:1], 1.0
; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14
; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12
; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10
; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8
; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6
; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4
; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2
; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 2, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 3, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 4, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v2
; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v2
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0
; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0
; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v0, s1
; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s2
; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v1, s1
; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s2
; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v0, s6
; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s3
; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v1, s6
; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s3
; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, v0, s4
; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s5
; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v1, s4
; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s5
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2
; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v8, v8, v1
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2
; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1
; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0
; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v2
; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v0 :: v_dual_cndmask_b32 v12, v12, v1
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2
; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1
; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s0
; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s0
; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v0 :: v_dual_cndmask_b32 v16, v16, v1
; GFX11-NEXT: global_store_b128 v[0:1], v[3:6], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b128 v[0:1], v[7:10], off dlc
@ -1172,29 +1169,29 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
; GFX11-NEXT: v_dual_mov_b32 v8, s7 :: v_dual_mov_b32 v7, s6
; GFX11-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v5, s4
; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s19, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v0
; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 7, v0
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v0
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s18, s0
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s19, s0
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s18, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s19, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, s18, s1
; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s18, s0
; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s19, s0
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v0
; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s18, s1
; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s19, s1
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 6, v0
; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, s18, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, s19, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, s19, s1
; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, s18, s0
; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, s19, s0
; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, s18, s1
; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, s19, s1
; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, s18, s2
; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, s19, s2
; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, s18, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, s19, vcc_lo
; GFX11-NEXT: global_store_b128 v[0:1], v[1:4], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b128 v[0:1], v[5:8], off dlc
@ -1584,36 +1581,33 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do
; GFX11-NEXT: s_mov_b32 s12, s14
; GFX11-NEXT: s_mov_b32 s14, s16
; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14
; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12
; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10
; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8
; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6
; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4
; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2
; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 2, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 3, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 4, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v2
; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v2
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0
; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0
; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v0, s1
; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s2
; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v1, s1
; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s2
; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v0, s6
; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s3
; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v1, s6
; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s3
; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, v0, s4
; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s5
; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v1, s4
; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s5
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2
; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v8, v8, v1
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2
; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1
; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0
; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v2
; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v0 :: v_dual_cndmask_b32 v12, v12, v1
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2
; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1
; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s0
; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s0
; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v0 :: v_dual_cndmask_b32 v16, v16, v1
; GFX11-NEXT: global_store_b128 v[0:1], v[3:6], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b128 v[0:1], v[7:10], off dlc
@ -5181,33 +5175,35 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec,
; GFX11-LABEL: dyn_insertelement_v7f64_v_v_v:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v16
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 6, v16
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v14, s0
; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v15, s0
; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v14, s1
; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v15, s1
; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v14 :: v_dual_cndmask_b32 v3, v3, v15
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v14 :: v_dual_cndmask_b32 v5, v5, v15
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16
; GFX11-NEXT: v_readfirstlane_b32 s10, v10
; GFX11-NEXT: v_readfirstlane_b32 s11, v11
; GFX11-NEXT: v_readfirstlane_b32 s4, v4
; GFX11-NEXT: v_readfirstlane_b32 s5, v5
; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v14 :: v_dual_cndmask_b32 v7, v7, v15
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16
; GFX11-NEXT: v_readfirstlane_b32 s12, v12
; GFX11-NEXT: v_readfirstlane_b32 s13, v13
; GFX11-NEXT: v_readfirstlane_b32 s6, v6
; GFX11-NEXT: v_readfirstlane_b32 s7, v7
; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v14 :: v_dual_cndmask_b32 v9, v9, v15
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16
; GFX11-NEXT: v_readfirstlane_b32 s8, v8
; GFX11-NEXT: v_readfirstlane_b32 s9, v9
; GFX11-NEXT: v_dual_cndmask_b32 v10, v10, v14 :: v_dual_cndmask_b32 v11, v11, v15
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16
; GFX11-NEXT: v_readfirstlane_b32 s10, v10
; GFX11-NEXT: v_readfirstlane_b32 s11, v11
; GFX11-NEXT: v_dual_cndmask_b32 v12, v12, v14 :: v_dual_cndmask_b32 v13, v13, v15
; GFX11-NEXT: v_readfirstlane_b32 s12, v12
; GFX11-NEXT: v_readfirstlane_b32 s13, v13
; GFX11-NEXT: ; return to shader part epilog
entry:
%insert = insertelement <7 x double> %vec, double %val, i32 %idx
@ -5629,26 +5625,27 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec,
; GFX11-LABEL: dyn_insertelement_v5f64_v_v_s:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s2, 4
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s2, 3
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s2, 4
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11
; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s0
; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v10, s0
; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v11, s0
; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1
; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s1
; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11
; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2
; GFX11-NEXT: v_readfirstlane_b32 s8, v8
; GFX11-NEXT: v_readfirstlane_b32 s9, v9
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v10 :: v_dual_cndmask_b32 v5, v5, v11
; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-NEXT: v_readfirstlane_b32 s4, v4
; GFX11-NEXT: v_readfirstlane_b32 s5, v5
; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v10 :: v_dual_cndmask_b32 v7, v7, v11
; GFX11-NEXT: v_readfirstlane_b32 s6, v6
; GFX11-NEXT: v_readfirstlane_b32 s7, v7
; GFX11-NEXT: v_readfirstlane_b32 s8, v8
; GFX11-NEXT: v_readfirstlane_b32 s4, v4
; GFX11-NEXT: v_readfirstlane_b32 s5, v5
; GFX11-NEXT: v_readfirstlane_b32 s9, v9
; GFX11-NEXT: ; return to shader part epilog
entry:
%insert = insertelement <5 x double> %vec, double %val, i32 %idx
@ -5717,26 +5714,27 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec,
; GFX11-LABEL: dyn_insertelement_v5f64_v_v_v:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v12
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v12
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 4, v12
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s0
; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v10, s0
; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v11, s0
; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1
; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s1
; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12
; GFX11-NEXT: v_readfirstlane_b32 s8, v8
; GFX11-NEXT: v_readfirstlane_b32 s9, v9
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v10 :: v_dual_cndmask_b32 v5, v5, v11
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12
; GFX11-NEXT: v_readfirstlane_b32 s4, v4
; GFX11-NEXT: v_readfirstlane_b32 s5, v5
; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v10 :: v_dual_cndmask_b32 v7, v7, v11
; GFX11-NEXT: v_readfirstlane_b32 s6, v6
; GFX11-NEXT: v_readfirstlane_b32 s7, v7
; GFX11-NEXT: v_readfirstlane_b32 s8, v8
; GFX11-NEXT: v_readfirstlane_b32 s4, v4
; GFX11-NEXT: v_readfirstlane_b32 s5, v5
; GFX11-NEXT: v_readfirstlane_b32 s9, v9
; GFX11-NEXT: ; return to shader part epilog
entry:
%insert = insertelement <5 x double> %vec, double %val, i32 %idx

View File

@ -557,8 +557,8 @@ attributes #256 = { nounwind "amdgpu-flat-work-group-size"="256,256" }
; GFX10CU-WAVE64: NumVgprs: 128
; GFX11WGP-WAVE32: NumVgprs: 256
; GFX11WGP-WAVE64: NumVgprs: 256
; GFX11CU-WAVE32: NumVgprs: 128
; GFX11CU-WAVE64: NumVgprs: 128
; GFX11CU-WAVE32: NumVgprs: 192
; GFX11CU-WAVE64: NumVgprs: 192
define amdgpu_kernel void @f512() #512 {
call void @foo()
call void @use256vgprs()
@ -574,10 +574,10 @@ attributes #512 = { nounwind "amdgpu-flat-work-group-size"="512,512" }
; GFX10WGP-WAVE64: NumVgprs: 128
; GFX10CU-WAVE32: NumVgprs: 64
; GFX10CU-WAVE64: NumVgprs: 64
; GFX11WGP-WAVE32: NumVgprs: 128
; GFX11WGP-WAVE64: NumVgprs: 128
; GFX11CU-WAVE32: NumVgprs: 64
; GFX11CU-WAVE64: NumVgprs: 64
; GFX11WGP-WAVE32: NumVgprs: 192
; GFX11WGP-WAVE64: NumVgprs: 192
; GFX11CU-WAVE32: NumVgprs: 96
; GFX11CU-WAVE64: NumVgprs: 96
define amdgpu_kernel void @f1024() #1024 {
call void @foo()
call void @use256vgprs()

View File

@ -3,11 +3,18 @@
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1010,GFX1010W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W32,GFX1030,GFX1030W32 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1030,GFX1030W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W32 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1101 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W32 %s
; RUN: llc -march=amdgcn -mcpu=gfx1101 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1102 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W32,GFX1030,GFX1030W32 %s
; RUN: llc -march=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1030,GFX1030W64 %s
; GCN-LABEL: {{^}}max_occupancy:
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @max_occupancy() {
ret void
}
@ -16,6 +23,8 @@ define amdgpu_kernel void @max_occupancy() {
; GFX9: ; Occupancy: 3
; GFX10W64: ; Occupancy: 3
; GFX10W32: ; Occupancy: 4
; GFX1100W64: ; Occupancy: 3
; GFX1100W32: ; Occupancy: 5
define amdgpu_kernel void @limited_occupancy_3() #0 {
ret void
}
@ -24,6 +33,7 @@ define amdgpu_kernel void @limited_occupancy_3() #0 {
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 18
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @limited_occupancy_18() #1 {
ret void
}
@ -32,6 +42,7 @@ define amdgpu_kernel void @limited_occupancy_18() #1 {
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 18
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @limited_occupancy_19() #2 {
ret void
}
@ -40,6 +51,7 @@ define amdgpu_kernel void @limited_occupancy_19() #2 {
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_24_vgprs() {
call void asm sideeffect "", "~{v23}" ()
ret void
@ -50,6 +62,7 @@ define amdgpu_kernel void @used_24_vgprs() {
; GFX1010W64: ; Occupancy: 18
; GFX1010W32: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_28_vgprs() {
call void asm sideeffect "", "~{v27}" ()
ret void
@ -60,6 +73,7 @@ define amdgpu_kernel void @used_28_vgprs() {
; GFX10W64: ; Occupancy: 16
; GFX1010W32: ; Occupancy: 20
; GFX1030W32: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_32_vgprs() {
call void asm sideeffect "", "~{v31}" ()
ret void
@ -71,6 +85,7 @@ define amdgpu_kernel void @used_32_vgprs() {
; GFX1010W32: ; Occupancy: 20
; GFX1030W64: ; Occupancy: 12
; GFX1030W32: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_36_vgprs() {
call void asm sideeffect "", "~{v35}" ()
ret void
@ -81,6 +96,7 @@ define amdgpu_kernel void @used_36_vgprs() {
; GFX10W64: ; Occupancy: 12
; GFX1010W32: ; Occupancy: 20
; GFX1030W32: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_40_vgprs() {
call void asm sideeffect "", "~{v39}" ()
ret void
@ -92,6 +108,7 @@ define amdgpu_kernel void @used_40_vgprs() {
; GFX1010W32: ; Occupancy: 20
; GFX1030W64: ; Occupancy: 10
; GFX1030W32: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_44_vgprs() {
call void asm sideeffect "", "~{v43}" ()
ret void
@ -102,6 +119,7 @@ define amdgpu_kernel void @used_44_vgprs() {
; GFX10W64: ; Occupancy: 10
; GFX1010W32: ; Occupancy: 20
; GFX1030W32: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_48_vgprs() {
call void asm sideeffect "", "~{v47}" ()
ret void
@ -112,6 +130,8 @@ define amdgpu_kernel void @used_48_vgprs() {
; GFX10W64: ; Occupancy: 9
; GFX1010W32: ; Occupancy: 18
; GFX1030W32: ; Occupancy: 16
; GFX1100W64: ; Occupancy: 12
; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_56_vgprs() {
call void asm sideeffect "", "~{v55}" ()
ret void
@ -121,6 +141,8 @@ define amdgpu_kernel void @used_56_vgprs() {
; GFX9: ; Occupancy: 4
; GFX10W64: ; Occupancy: 8
; GFX10W32: ; Occupancy: 16
; GFX1100W64: ; Occupancy: 10
; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_64_vgprs() {
call void asm sideeffect "", "~{v63}" ()
ret void
@ -131,6 +153,8 @@ define amdgpu_kernel void @used_64_vgprs() {
; GFX10W64: ; Occupancy: 7
; GFX1010W32: ; Occupancy: 14
; GFX1030W32: ; Occupancy: 12
; GFX1100W64: ; Occupancy: 10
; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_72_vgprs() {
call void asm sideeffect "", "~{v71}" ()
ret void
@ -140,6 +164,8 @@ define amdgpu_kernel void @used_72_vgprs() {
; GFX9: ; Occupancy: 3
; GFX10W64: ; Occupancy: 6
; GFX10W32: ; Occupancy: 12
; GFX1100W64: ; Occupancy: 9
; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_80_vgprs() {
call void asm sideeffect "", "~{v79}" ()
ret void
@ -151,6 +177,8 @@ define amdgpu_kernel void @used_80_vgprs() {
; GFX1010W32: ; Occupancy: 11
; GFX1030W64: ; Occupancy: 5
; GFX1030W32: ; Occupancy: 10
; GFX1100W64: ; Occupancy: 9
; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_84_vgprs() {
call void asm sideeffect "", "~{v83}" ()
ret void
@ -161,6 +189,8 @@ define amdgpu_kernel void @used_84_vgprs() {
; GFX10W64: ; Occupancy: 5
; GFX1010W32: ; Occupancy: 11
; GFX1030W32: ; Occupancy: 10
; GFX1100W64: ; Occupancy: 8
; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_88_vgprs() {
call void asm sideeffect "", "~{v87}" ()
ret void
@ -170,6 +200,8 @@ define amdgpu_kernel void @used_88_vgprs() {
; GFX9: ; Occupancy: 2
; GFX10W64: ; Occupancy: 5
; GFX10W32: ; Occupancy: 10
; GFX1100W64: ; Occupancy: 8
; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_96_vgprs() {
call void asm sideeffect "", "~{v95}" ()
ret void
@ -180,6 +212,8 @@ define amdgpu_kernel void @used_96_vgprs() {
; GFX1010W64: ; Occupancy: 5
; GFX1030W64: ; Occupancy: 4
; GFX10W32: ; Occupancy: 9
; GFX1100W64: ; Occupancy: 7
; GFX1100W32: ; Occupancy: 12
define amdgpu_kernel void @used_100_vgprs() {
call void asm sideeffect "", "~{v99}" ()
ret void
@ -189,6 +223,8 @@ define amdgpu_kernel void @used_100_vgprs() {
; GFX9: ; Occupancy: 2
; GFX10W64: ; Occupancy: 4
; GFX10W32: ; Occupancy: 9
; GFX1100W64: ; Occupancy: 6
; GFX1100W32: ; Occupancy: 12
define amdgpu_kernel void @used_112_vgprs() {
call void asm sideeffect "", "~{v111}" ()
ret void
@ -198,6 +234,8 @@ define amdgpu_kernel void @used_112_vgprs() {
; GFX9: ; Occupancy: 2
; GFX10W64: ; Occupancy: 4
; GFX10W32: ; Occupancy: 8
; GFX1100W64: ; Occupancy: 5
; GFX1100W32: ; Occupancy: 10
define amdgpu_kernel void @used_128_vgprs() {
call void asm sideeffect "", "~{v127}" ()
ret void
@ -207,6 +245,8 @@ define amdgpu_kernel void @used_128_vgprs() {
; GFX9: ; Occupancy: 1
; GFX10W64: ; Occupancy: 3
; GFX10W32: ; Occupancy: 7
; GFX1100W64: ; Occupancy: 5
; GFX1100W32: ; Occupancy: 10
define amdgpu_kernel void @used_144_vgprs() {
call void asm sideeffect "", "~{v143}" ()
ret void
@ -217,6 +257,8 @@ define amdgpu_kernel void @used_144_vgprs() {
; GFX10W64: ; Occupancy: 3
; GFX1010W32: ; Occupancy: 6
; GFX1030W32: ; Occupancy: 5
; GFX1100W64: ; Occupancy: 4
; GFX1100W32: ; Occupancy: 9
define amdgpu_kernel void @used_168_vgprs() {
call void asm sideeffect "", "~{v167}" ()
ret void
@ -227,6 +269,8 @@ define amdgpu_kernel void @used_168_vgprs() {
; GFX10W64: ; Occupancy: 2
; GFX1010W32: ; Occupancy: 5
; GFX1030W32: ; Occupancy: 4
; GFX1100W64: ; Occupancy: 3
; GFX1100W32: ; Occupancy: 7
define amdgpu_kernel void @used_200_vgprs() {
call void asm sideeffect "", "~{v199}" ()
ret void
@ -236,6 +280,8 @@ define amdgpu_kernel void @used_200_vgprs() {
; GFX9: ; Occupancy: 1
; GFX10W64: ; Occupancy: 2
; GFX10W32: ; Occupancy: 4
; GFX1100W64: ; Occupancy: 2
; GFX1100W32: ; Occupancy: 5
define amdgpu_kernel void @used_256_vgprs() {
call void asm sideeffect "", "~{v255}" ()
ret void
@ -245,6 +291,7 @@ define amdgpu_kernel void @used_256_vgprs() {
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_80_sgprs() {
call void asm sideeffect "", "~{s79}" ()
ret void
@ -254,6 +301,7 @@ define amdgpu_kernel void @used_80_sgprs() {
; GFX9: ; Occupancy: 9
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_88_sgprs() {
call void asm sideeffect "", "~{s87}" ()
ret void
@ -263,6 +311,7 @@ define amdgpu_kernel void @used_88_sgprs() {
; GFX9: ; Occupancy: 8
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_100_sgprs() {
call void asm sideeffect "", "~{s99}" ()
ret void
@ -272,6 +321,7 @@ define amdgpu_kernel void @used_100_sgprs() {
; GFX9: ; Occupancy: 7
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_101_sgprs() {
call void asm sideeffect "", "~{s100}" ()
ret void
@ -281,6 +331,7 @@ define amdgpu_kernel void @used_101_sgprs() {
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
@lds6552 = internal addrspace(3) global [6552 x i8] undef, align 4
define amdgpu_kernel void @used_lds_6552() {
%p = bitcast [6552 x i8] addrspace(3)* @lds6552 to i8 addrspace(3)*
@ -292,6 +343,7 @@ define amdgpu_kernel void @used_lds_6552() {
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
@lds6556 = internal addrspace(3) global [6556 x i8] undef, align 4
define amdgpu_kernel void @used_lds_6556() {
%p = bitcast [6556 x i8] addrspace(3)* @lds6556 to i8 addrspace(3)*
@ -303,6 +355,7 @@ define amdgpu_kernel void @used_lds_6556() {
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
@lds13112 = internal addrspace(3) global [13112 x i8] undef, align 4
define amdgpu_kernel void @used_lds_13112() {
%p = bitcast [13112 x i8] addrspace(3)* @lds13112 to i8 addrspace(3)*
@ -314,6 +367,8 @@ define amdgpu_kernel void @used_lds_13112() {
; GFX9: ; Occupancy: 7{{$}}
; GFX10W64: ; Occupancy: 7{{$}}
; GFX10W32: ; Occupancy: 14{{$}}
; GFX1100W64: ; Occupancy: 7{{$}}
; GFX1100W32: ; Occupancy: 14{{$}}
@lds8252 = internal addrspace(3) global [8252 x i8] undef, align 4
define amdgpu_kernel void @used_lds_8252_max_group_size_64() #3 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
@ -326,6 +381,8 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_64() #3 {
; GFX10W64: ; Occupancy: 14{{$}}
; GFX1010W32: ; Occupancy: 20{{$}}
; GFX1030W32: ; Occupancy: 16{{$}}
; GFX1100W64: ; Occupancy: 14{{$}}
; GFX1100W32: ; Occupancy: 16{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_96() #4 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
@ -337,6 +394,8 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_96() #4 {
; GFX10W64: ; Occupancy: 14{{$}}
; GFX1010W32: ; Occupancy: 20{{$}}
; GFX1030W32: ; Occupancy: 16{{$}}
; GFX1100W64: ; Occupancy: 14{{$}}
; GFX1100W32: ; Occupancy: 16{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_128() #5 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
@ -347,6 +406,7 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_128() #5 {
; GFX9: ; Occupancy: 10{{$}}
; GFX1010: ; Occupancy: 20{{$}}
; GFX1030: ; Occupancy: 16{{$}}
; GFX1100: ; Occupancy: 16{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_192() #6 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
@ -357,6 +417,7 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_192() #6 {
; GFX9: ; Occupancy: 10{{$}}
; GFX1010: ; Occupancy: 20{{$}}
; GFX1030: ; Occupancy: 16{{$}}
; GFX1100: ; Occupancy: 16{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_256() #7 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
@ -367,6 +428,7 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_256() #7 {
; GFX9: ; Occupancy: 10{{$}}
; GFX1010: ; Occupancy: 20{{$}}
; GFX1030: ; Occupancy: 16{{$}}
; GFX1100: ; Occupancy: 16{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_512() #8 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
@ -377,6 +439,7 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_512() #8 {
; GFX9: ; Occupancy: 10{{$}}
; GFX1010: ; Occupancy: 20{{$}}
; GFX1030: ; Occupancy: 16{{$}}
; GFX1100: ; Occupancy: 16{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_1024() #9 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
@ -386,6 +449,7 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_1024() #9 {
; GCN-LABEL: {{^}}used_lds_8252_max_group_size_32:
; GFX9: ; Occupancy: 7{{$}}
; GFX10: ; Occupancy: 7{{$}}
; GFX1100: ; Occupancy: 7{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p