; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX9 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX10 %s

; Test case looks at the allocated offset of @used_by_both. It's at zero when
; allocated by itself, but at 8 when allocated in combination with the double.
; Redundantly also checks LDSByteSize.
; LDS (addrspace(3)) variables exercised by the functions in this test.
; i32 stored to by @withcall, @nocall_false_sharing and @nonkernel.
@used_by_both = addrspace(3) global i32 undef
; i32 stored to by @nocall_ideal.
@used_by_kernel = addrspace(3) global i32 undef
; double stored to by @nonkernel.
@used_by_function = addrspace(3) global double undef

; A kernel that calls no functions and uses a single LDS variable allocates only
; that variable, so the access is at offset 0 and LDSByteSize is 4.
define amdgpu_kernel void @nocall_ideal() {
; CHECK-LABEL: nocall_ideal:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: ds_write_b32 v0, v0
; CHECK-NEXT: s_endpgm
  ; The only LDS access in this kernel: a store of zero to @used_by_kernel.
  store i32 0, i32 addrspace(3)* @used_by_kernel
  ret void
}
; CHECK: ; LDSByteSize: 4 bytes

; Needs to allocate both variables; the store to used_by_both is expected at
; offset sizeof(double), i.e. 8, and LDSByteSize is 16.
; The two SelectionDAG run lines expect the offset folded into the ds_write
; (offset:8); the two -global-isel run lines expect 8 materialized in v1 and
; used as the address.
define amdgpu_kernel void @withcall() {
; GFX9-LABEL: withcall:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_mov_b32 s11, 0xe00000
; GFX9-NEXT: s_add_u32 s8, s8, s3
; GFX9-NEXT: s_addc_u32 s9, s9, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[8:9]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b64 s[2:3], s[10:11]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: ds_write_b32 v0, v0 offset:8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: withcall:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s10, -1
; GFX10-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-NEXT: s_add_u32 s8, s8, s3
; GFX10-NEXT: s_addc_u32 s9, s9, 0
; GFX10-NEXT: s_getpc_b64 s[2:3]
; GFX10-NEXT: s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX10-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], s[8:9]
; GFX10-NEXT: s_mov_b64 s[2:3], s[10:11]
; GFX10-NEXT: s_mov_b32 s32, 0
; GFX10-NEXT: ds_write_b32 v0, v0 offset:8
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: s_endpgm
;
; G_GFX9-LABEL: withcall:
; G_GFX9: ; %bb.0:
; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; G_GFX9-NEXT: s_mov_b32 s10, -1
; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000
; G_GFX9-NEXT: s_add_u32 s8, s8, s3
; G_GFX9-NEXT: s_addc_u32 s9, s9, 0
; G_GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; G_GFX9-NEXT: s_getpc_b64 s[0:1]
; G_GFX9-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
; G_GFX9-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
; G_GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; G_GFX9-NEXT: s_mov_b64 s[0:1], s[8:9]
; G_GFX9-NEXT: v_mov_b32_e32 v0, 0
; G_GFX9-NEXT: v_mov_b32_e32 v1, 8
; G_GFX9-NEXT: s_mov_b64 s[2:3], s[10:11]
; G_GFX9-NEXT: s_mov_b32 s32, 0
; G_GFX9-NEXT: ds_write_b32 v1, v0
; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; G_GFX9-NEXT: s_endpgm
;
; G_GFX10-LABEL: withcall:
; G_GFX10: ; %bb.0:
; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; G_GFX10-NEXT: s_mov_b32 s10, -1
; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000
; G_GFX10-NEXT: s_add_u32 s8, s8, s3
; G_GFX10-NEXT: s_addc_u32 s9, s9, 0
; G_GFX10-NEXT: s_mov_b64 s[6:7], s[0:1]
; G_GFX10-NEXT: s_getpc_b64 s[0:1]
; G_GFX10-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
; G_GFX10-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
; G_GFX10-NEXT: v_mov_b32_e32 v0, 0
; G_GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; G_GFX10-NEXT: v_mov_b32_e32 v1, 8
; G_GFX10-NEXT: s_mov_b64 s[0:1], s[8:9]
; G_GFX10-NEXT: s_mov_b64 s[2:3], s[10:11]
; G_GFX10-NEXT: s_mov_b32 s32, 0
; G_GFX10-NEXT: ds_write_b32 v1, v0
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; G_GFX10-NEXT: s_endpgm
  ; Store to the shared i32, then call the non-kernel function that also
  ; stores to it.
  store i32 0, i32 addrspace(3)* @used_by_both
  call void @nonkernel()
  ret void
}
; CHECK: ; LDSByteSize: 16 bytes

; Kernel only needs to allocate the i32 it uses, but because that i32 is also
; used by a non-kernel function it is block allocated along with the double
; used by the non-kernel function. This kernel therefore allocates 16 bytes
; and the accesses to the integer are at offset 8.
define amdgpu_kernel void @nocall_false_sharing() {
; GFX9-LABEL: nocall_false_sharing:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: ds_write_b32 v0, v0 offset:8
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: nocall_false_sharing:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: ds_write_b32 v0, v0 offset:8
; GFX10-NEXT: s_endpgm
;
; G_GFX9-LABEL: nocall_false_sharing:
; G_GFX9: ; %bb.0:
; G_GFX9-NEXT: v_mov_b32_e32 v0, 0
; G_GFX9-NEXT: v_mov_b32_e32 v1, 8
; G_GFX9-NEXT: ds_write_b32 v1, v0
; G_GFX9-NEXT: s_endpgm
;
; G_GFX10-LABEL: nocall_false_sharing:
; G_GFX10: ; %bb.0:
; G_GFX10-NEXT: v_mov_b32_e32 v0, 0
; G_GFX10-NEXT: v_mov_b32_e32 v1, 8
; G_GFX10-NEXT: ds_write_b32 v1, v0
; G_GFX10-NEXT: s_endpgm
  ; No call here; the kernel only stores to the shared i32.
  store i32 0, i32 addrspace(3)* @used_by_both
  ret void
}
; CHECK: ; LDSByteSize: 16 bytes



; Non-kernel function (called from @withcall) that stores to both
; @used_by_both and @used_by_function. The checks expect the 32-bit store at
; offset 8 and the 64-bit store at offset 0.
define void @nonkernel() {
; GFX9-LABEL: nonkernel:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: ds_write_b32 v0, v0 offset:8
; GFX9-NEXT: ds_write_b64 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: nonkernel:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: ds_write_b32 v0, v0 offset:8
; GFX10-NEXT: ds_write_b64 v0, v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; G_GFX9-LABEL: nonkernel:
; G_GFX9: ; %bb.0:
; G_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; G_GFX9-NEXT: v_mov_b32_e32 v2, 0
; G_GFX9-NEXT: v_mov_b32_e32 v3, 8
; G_GFX9-NEXT: v_mov_b32_e32 v0, 0
; G_GFX9-NEXT: v_mov_b32_e32 v1, 0
; G_GFX9-NEXT: ds_write_b32 v3, v2
; G_GFX9-NEXT: ds_write_b64 v2, v[0:1]
; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX9-NEXT: s_setpc_b64 s[30:31]
;
; G_GFX10-LABEL: nonkernel:
; G_GFX10: ; %bb.0:
; G_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; G_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; G_GFX10-NEXT: v_mov_b32_e32 v2, 0
; G_GFX10-NEXT: v_mov_b32_e32 v3, 8
; G_GFX10-NEXT: v_mov_b32_e32 v0, 0
; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
; G_GFX10-NEXT: ds_write_b32 v3, v2
; G_GFX10-NEXT: ds_write_b64 v2, v[0:1]
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: s_setpc_b64 s[30:31]
  ; Zero the shared i32 and the double used by this function.
  store i32 0, i32 addrspace(3)* @used_by_both
  store double 0.0, double addrspace(3)* @used_by_function
  ret void
}