enable code-object-version=5

Ron Lieberman 2022-11-29 14:12:05 -06:00
parent 50921a2174
commit d882ba7aea
58 changed files with 5488 additions and 5555 deletions

View File

@@ -3690,12 +3690,12 @@ defm amdgpu_ieee : BoolOption<"m", "amdgpu-ieee",
NegFlag<SetFalse, [CC1Option]>>, Group<m_Group>;
def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, Group<m_Group>,
HelpText<"Specify code object ABI version. Defaults to 4. (AMDGPU only)">,
HelpText<"Specify code object ABI version. Defaults to 5. (AMDGPU only)">,
Flags<[CC1Option]>,
Values<"none,2,3,4,5">,
NormalizedValuesScope<"TargetOptions">,
NormalizedValues<["COV_None", "COV_2", "COV_3", "COV_4", "COV_5"]>,
MarshallingInfoEnum<TargetOpts<"CodeObjectVersion">, "COV_4">;
MarshallingInfoEnum<TargetOpts<"CodeObjectVersion">, "COV_5">;
defm code_object_v3_legacy : SimpleMFlag<"code-object-v3",
"Legacy option to specify code object ABI V3",

View File

@@ -2157,7 +2157,7 @@ void tools::checkAMDGPUCodeObjectVersion(const Driver &D,
unsigned tools::getAMDGPUCodeObjectVersion(const Driver &D,
const llvm::opt::ArgList &Args) {
unsigned CodeObjVer = 4; // default
unsigned CodeObjVer = 5; // default
if (auto *CodeObjArg = getAMDGPUCodeObjectArgument(D, Args)) {
if (CodeObjArg->getOption().getID() ==
options::OPT_mno_code_object_v3_legacy) {
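The hunk above seeds the driver-side default with 5. A minimal sketch of the resolution order, assuming the option value has already been parsed out of the argument list (the real helper also maps the legacy -m[no-]code-object-v3 flags handled in the surrounding code):

#include <optional>

// Sketch only: an explicit -mcode-object-version=N value, when present,
// overrides the new built-in default of 5 (previously 4).
unsigned resolveCodeObjectVersion(std::optional<unsigned> Requested) {
  unsigned CodeObjVer = 5; // default
  if (Requested)
    CodeObjVer = *Requested;
  return CodeObjVer;
}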

View File

@@ -1,7 +1,7 @@
// Create module flag for code object version.
// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
// RUN: -o - %s | FileCheck %s -check-prefix=V4
// RUN: -o - %s | FileCheck %s -check-prefix=V5
// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
// RUN: -mcode-object-version=2 -o - %s | FileCheck -check-prefix=V2 %s

View File

@@ -1,5 +1,5 @@
// RUN: %clang_cc1 -no-opaque-pointers -triple amdgcn-amd-amdhsa \
// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
// RUN: -fcuda-is-device -mcode-object-version=4 -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefix=PRECOV5 %s

View File

@@ -583,13 +583,13 @@ void test_get_local_id(int d, global int *out)
}
// CHECK-LABEL: @test_get_workgroup_size(
// CHECK: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 4
// CHECK: call align 8 dereferenceable(256) i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 12
// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load
// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 6
// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 14
// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 8
// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load
// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 16
// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 8, !range [[$WS_RANGE:![0-9]*]], !invariant.load
void test_get_workgroup_size(int d, global int *out)
{
switch (d) {
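The updated checks show where COV5 finds the workgroup size: the builtin now walks the implicit kernel arguments (offsets 12, 14, 16 from llvm.amdgcn.implicitarg.ptr) instead of the dispatch packet (offsets 4, 6, 8 from the dispatch pointer). A sketch of that offset change, with the numbers taken from the CHECK lines above rather than from the ABI document:

#include <cstdint>

// Byte offset of the 16-bit workgroup-size field for dimension dim (0..2).
// COV5 reads implicitarg_ptr + 12/14/16; earlier versions read
// dispatch_ptr + 4/6/8 out of the HSA dispatch packet.
uint64_t workgroupSizeOffset(unsigned dim, unsigned codeObjectVersion) {
  return codeObjectVersion >= 5 ? 12 + 2 * dim : 4 + 2 * dim;
}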

View File

@@ -139,13 +139,13 @@
// Test default code object version.
// RUN: %clang -### --target=x86_64-linux-gnu --offload-arch=gfx900 \
// RUN: --rocm-path=%S/Inputs/rocm %S/Inputs/hip_multiple_inputs/b.hip \
// RUN: 2>&1 | FileCheck %s --check-prefixes=ABI4
// RUN: 2>&1 | FileCheck %s --check-prefixes=ABI5
// Test default code object version with old device library without abi_version_400.bc
// RUN: %clang -### --target=x86_64-linux-gnu --offload-arch=gfx900 \
// RUN: --hip-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode-no-abi-ver \
// RUN: --rocm-path=%S/Inputs/rocm %S/Inputs/hip_multiple_inputs/b.hip \
// RUN: 2>&1 | FileCheck %s --check-prefixes=NOABI4
// RUN: 2>&1 | FileCheck %s --check-prefixes=NOABI5
// Test -mcode-object-version=3
// RUN: %clang -### --target=x86_64-linux-gnu --offload-arch=gfx900 \

View File

@@ -13,7 +13,7 @@
# CHECK-NEXT: DataEncoding: LittleEndian (0x1)
# CHECK-NEXT: FileVersion: 1
# CHECK-NEXT: OS/ABI: AMDGPU_HSA (0x40)
# CHECK-NEXT: ABIVersion: 2
# CHECK-NEXT: ABIVersion: 3
# CHECK-NEXT: Unused: (00 00 00 00 00 00 00)
# CHECK-NEXT: }
# CHECK-NEXT: Type: Executable (0x2)

View File

@@ -15,7 +15,7 @@
; RUN: llvm-readobj --file-headers %t/mesa3d.so | FileCheck %s --check-prefixes=GCN,NON-AMDHSA,MESA3D
; AMDHSA: OS/ABI: AMDGPU_HSA (0x40)
; AMDHSA: ABIVersion: 2
; AMDHSA: ABIVersion: 3
; AMDPAL: OS/ABI: AMDGPU_PAL (0x41)
; MESA3D: OS/ABI: AMDGPU_MESA3D (0x42)
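Both readobj tests track the same encoding: the ELF header's EI_ABIVERSION byte for AMDGPU_HSA objects counts from code object v2, so v4 serializes as 2 and v5 as 3. A sketch of that mapping, using the ELFABIVERSION_AMDGPU_HSA_* values from llvm/BinaryFormat/ELF.h:

#include <cstdint>

// EI_ABIVERSION byte written for ELFOSABI_AMDGPU_HSA objects; the checks
// above move from 2 (ELFABIVERSION_AMDGPU_HSA_V4) to 3 (..._V5).
uint8_t hsaElfAbiVersion(unsigned codeObjectVersion) {
  switch (codeObjectVersion) {
  case 2:  return 0; // ELFABIVERSION_AMDGPU_HSA_V2
  case 3:  return 1; // ELFABIVERSION_AMDGPU_HSA_V3
  case 4:  return 2; // ELFABIVERSION_AMDGPU_HSA_V4
  default: return 3; // ELFABIVERSION_AMDGPU_HSA_V5
  }
}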

View File

@@ -32,7 +32,7 @@
static llvm::cl::opt<unsigned>
AmdhsaCodeObjectVersion("amdhsa-code-object-version", llvm::cl::Hidden,
llvm::cl::desc("AMDHSA Code Object Version"),
llvm::cl::init(4));
llvm::cl::init(5));
namespace {

View File

@@ -7,43 +7,42 @@ declare void @callee()
define amdgpu_kernel void @call_debug_loc() {
; CHECK-LABEL: name: call_debug_loc
; CHECK: bb.1.entry:
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2, debug-location !6
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1, debug-location !6
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0, debug-location !6
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16, debug-location !6
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15, debug-location !6
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14, debug-location !6
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11, debug-location !6
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7, debug-location !6
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5, debug-location !6
; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14, debug-location !6
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13, debug-location !6
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12, debug-location !6
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9, debug-location !6
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5, debug-location !6
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY $sgpr6_sgpr7
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, debug-location !6
; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[COPY8]], debug-location !6
; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[COPY7]], debug-location !6
; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[COPY6]], debug-location !6
; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY5]], debug-location !6
; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY4]], debug-location !6
; CHECK-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY3]], debug-location !6
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF debug-location !6
; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[COPY7]], debug-location !6
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF debug-location !6
; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[COPY6]], debug-location !6
; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY5]], debug-location !6
; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY4]], debug-location !6
; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY3]], debug-location !6
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF debug-location !6
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 10
; CHECK-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[COPY1]], implicit $exec, debug-location !6
; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY14]], [[COPY1]], implicit $exec, debug-location !6
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 20
; CHECK-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY17]], [[COPY]], implicit $exec, debug-location !6
; CHECK-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[COPY]], implicit $exec, debug-location !6
; CHECK-NEXT: [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 [[COPY2]], [[V_LSHLREV_B32_e64_]], [[V_LSHLREV_B32_e64_1]], implicit $exec, debug-location !6
; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3, debug-location !6
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]], debug-location !6
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]], debug-location !6
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]], debug-location !6
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY9]], debug-location !6
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY12]], debug-location !6
; CHECK-NEXT: $sgpr12 = COPY [[COPY13]], debug-location !6
; CHECK-NEXT: $sgpr13 = COPY [[COPY14]], debug-location !6
; CHECK-NEXT: $sgpr14 = COPY [[COPY15]], debug-location !6
; CHECK-NEXT: $sgpr15 = COPY [[DEF]], debug-location !6
; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3, debug-location !6
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]], debug-location !6
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]], debug-location !6
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]], debug-location !6
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY8]], debug-location !6
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]], debug-location !6
; CHECK-NEXT: $sgpr12 = COPY [[COPY11]], debug-location !6
; CHECK-NEXT: $sgpr13 = COPY [[COPY12]], debug-location !6
; CHECK-NEXT: $sgpr14 = COPY [[COPY13]], debug-location !6
; CHECK-NEXT: $sgpr15 = COPY [[DEF1]], debug-location !6
; CHECK-NEXT: $vgpr31 = COPY [[V_OR3_B32_e64_]], debug-location !6
; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc, debug-location !6
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0, debug-location !6 :: (dereferenceable invariant load (p0) from got, addrspace 4)
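Two ABI effects are visible in this hunk: the caller now feeds an IMPLICIT_DEF into the retired queue-pointer slot (so the live-in list loses one SGPR pair and the later arguments shift down), and $vgpr31 still carries the workitem IDs packed by the V_LSHLREV/V_OR3 sequence. A sketch of that packing, with the shift amounts read off the checks above:

#include <cstdint>

// Packed workitem-ID register ($vgpr31): x in bits 0-9, y in bits 10-19,
// z in bits 20-29, matching the S_MOV_B32 10 / S_MOV_B32 20 shifts above.
uint32_t packWorkitemIds(uint32_t x, uint32_t y, uint32_t z) {
  return x | (y << 10) | (z << 20);
}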

View File

@@ -32,13 +32,13 @@ define void @call_result_align_1() {
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @returns_ptr
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]]
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4)
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]]
@@ -81,13 +81,13 @@ define void @call_result_align_8() {
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @returns_ptr
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]]
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4)
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]]
@@ -131,13 +131,13 @@ define void @declaration_result_align_8() {
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @returns_ptr_align8
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]]
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4)
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]]
@@ -181,11 +181,11 @@ define ptr addrspace(1) @tail_call_assert_align() {
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @returns_ptr_align8
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]]
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4)
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]]

View File

@@ -9,37 +9,36 @@ declare hidden void @extern()
define amdgpu_kernel void @kernel_call_no_workitem_ids() {
; CHECK-LABEL: name: kernel_call_no_workitem_ids
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr14
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr13
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY4]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4)
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY [[COPY4]]
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]](p4)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C]](s64)
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY3]]
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]]
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY8]](p4)
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY7]], [[C]](s64)
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s64) = COPY [[COPY3]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY2]]
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY]]
; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY12]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY6]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64)
; CHECK-NEXT: $sgpr12 = COPY [[COPY11]](s32)
; CHECK-NEXT: $sgpr13 = COPY [[COPY12]](s32)
; CHECK-NEXT: $sgpr14 = COPY [[COPY13]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY8]](s64)
; CHECK-NEXT: $sgpr12 = COPY [[COPY9]](s32)
; CHECK-NEXT: $sgpr13 = COPY [[COPY10]](s32)
; CHECK-NEXT: $sgpr14 = COPY [[COPY11]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @extern, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: S_ENDPGM 0
@@ -50,40 +49,39 @@ define amdgpu_kernel void @kernel_call_no_workitem_ids() {
define amdgpu_kernel void @kernel_call_no_workgroup_ids() {
; CHECK-LABEL: name: kernel_call_no_workgroup_ids
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY4]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4)
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY [[COPY4]]
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]](p4)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C]](s64)
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY3]]
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY7]], [[C]](s64)
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s64) = COPY [[COPY3]]
; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY12]], [[C1]](s32)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY11]], [[SHL]]
; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[C1]](s32)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY9]], [[SHL]]
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[C2]](s32)
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY11]], [[C2]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY8]](p4)
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY12]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY6]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64)
; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY8]](s64)
; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @extern, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
@@ -95,12 +93,12 @@ define amdgpu_kernel void @kernel_call_no_workgroup_ids() {
define amdgpu_kernel void @kernel_call_no_other_sgprs() {
; CHECK-LABEL: name: kernel_call_no_other_sgprs
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr8_sgpr9
; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr6_sgpr7
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern
; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(p4) = COPY [[COPY3]](p4)
@@ -139,12 +137,12 @@ define void @func_call_no_workitem_ids() {
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4)
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]]
@@ -177,12 +175,12 @@ define void @func_call_no_workgroup_ids() {
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY4]]
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY4]](p4)
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY3]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY [[COPY2]]
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]]

View File

@@ -6,18 +6,17 @@ declare hidden void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(p
define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 {
; GCN-LABEL: name: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32
; GCN: bb.1 (%ir-block.1):
; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1
; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16
; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15
; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14
; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14
; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13
; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7
; GCN-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 3
; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -30,24 +29,24 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; GCN-NEXT: G_STORE [[C1]](s32), [[PTR_ADD]](p5) :: (store (s32) into %ir.in.gep1, addrspace 5)
; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32
; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4)
; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4)
; GCN-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C3]](s64)
; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]]
; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]]
; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]]
; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]]
; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C3]](s64)
; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]]
; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]]
; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]]
; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C4]](s32)
; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]]
; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C4]](s32)
; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]]
; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; GCN-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C5]](s32)
; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C5]](s32)
; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; GCN-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg
; GCN-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
@@ -55,16 +54,16 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; GCN-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
; GCN-NEXT: G_MEMCPY [[PTR_ADD2]](p5), [[FRAME_INDEX]](p5), [[C7]](s32), 0 :: (dereferenceable store (s64) into stack, align 4, addrspace 5), (dereferenceable load (s64) from %ir.in.val, align 4, addrspace 5)
; GCN-NEXT: $vgpr0 = COPY [[FRAME_INDEX1]](p5)
; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4)
; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD1]](p4)
; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64)
; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32)
; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32)
; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32)
; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64)
; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32)
; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32)
; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32)
; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32)
; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32)
; GCN-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; GCN-NEXT: ADJCALLSTACKDOWN 0, 8, implicit-def $scc

File diff suppressed because it is too large

View File

@@ -4,50 +4,49 @@
define amdgpu_kernel void @test_indirect_call_sgpr_ptr(ptr %fptr) {
; CHECK-LABEL: name: test_indirect_call_sgpr_ptr
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD [[INT]](p4) :: (dereferenceable invariant load (p0) from %ir.fptr.kernarg.offset1, align 16, addrspace 4)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4)
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64)
; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]]
; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]]
; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]]
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64)
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]]
; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]]
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]]
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32)
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64)
; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32)
; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32)
; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64)
; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32)
; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32)
; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[LOAD]](p0), 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc

View File

@@ -816,7 +816,7 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 {
; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -933,7 +933,7 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 {
; GCN-NEXT: G_STORE [[C1]](s64), [[PTR_ADD2]](p5) :: (store (s64) into %ir.alloca1 + 8, addrspace 5)
; GCN-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @void_fastcc_multi_byval
; GCN-NEXT: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY8]]
; GCN-NEXT: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; GCN-NEXT: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4)
; GCN-NEXT: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY6]]
; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY5]]
; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY4]]
@@ -984,7 +984,7 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64
; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1097,7 +1097,7 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64
; GCN-NEXT: G_STORE [[C]](s32), [[PTR_ADD1]](p5) :: (store (s32) into %ir.alloca + 8, addrspace 5)
; GCN-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @void_fastcc_byval_and_stack_passed
; GCN-NEXT: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY8]]
; GCN-NEXT: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; GCN-NEXT: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4)
; GCN-NEXT: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY6]]
; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY5]]
; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY4]]
@@ -1176,14 +1176,14 @@ define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 {
; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY9]](s32), [[COPY10]](s32)
; GCN-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i64_fastcc_i64
; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]]
; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4)
; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]]
; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]]
; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]]
@@ -1225,14 +1225,14 @@ define hidden fastcc ptr addrspace(1) @sibling_call_p1i8_fastcc_p1i8(ptr addrspa
; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY9]](s32), [[COPY10]](s32)
; GCN-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @p1i8_fastcc_p1i8
; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]]
; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4)
; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]]
; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]]
; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]]
@@ -1274,13 +1274,13 @@ define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 {
; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GCN-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i16_fastcc_i16
; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4)
; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]]
; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]]
; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]]
@@ -1321,13 +1321,13 @@ define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 {
; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GCN-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @f16_fastcc_f16
; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4)
; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]]
; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]]
; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]]
@@ -1368,7 +1368,7 @@ define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1
; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; GCN-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
; GCN-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1377,7 +1377,7 @@ define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1
; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16)
; GCN-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @v3i16_fastcc_v3i16
; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]]
; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4)
; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]]
; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]]
; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]]
@@ -1422,14 +1422,14 @@ define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1
; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; GCN-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
; GCN-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY9]](<2 x s16>), [[COPY10]](<2 x s16>)
; GCN-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @v4i16_fastcc_v4i16
; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]]
; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4)
; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]]
; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]]
; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]]
@@ -1471,7 +1471,7 @@ define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1
; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1482,7 +1482,7 @@ define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1
; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
; GCN-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @v2i64_fastcc_v2i64
; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY8]]
; GCN-NEXT: [[COPY14:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; GCN-NEXT: [[COPY14:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4)
; GCN-NEXT: [[COPY15:%[0-9]+]]:_(p4) = COPY [[COPY6]]
; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s64) = COPY [[COPY5]]
; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY4]]

View File

@@ -15,11 +15,11 @@ define void @tail_call_void_func_void() {
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_void_func_void
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]]
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4)
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]]

View File

@@ -12,9 +12,11 @@ define void @func_use_lds_global() {
; GFX8-LABEL: func_use_lds_global:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_trap 2
; GFX8-NEXT: ds_write_b32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -37,7 +39,9 @@ define void @func_use_lds_global_constexpr_cast() {
; GFX8-LABEL: func_use_lds_global_constexpr_cast:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_trap 2
; GFX8-NEXT: flat_store_dword v[0:1], v0
; GFX8-NEXT: s_waitcnt vmcnt(0)

View File

@@ -209,19 +209,16 @@ body: |
liveins: $vgpr0
; VI-LABEL: name: test_addrspacecast_p5_to_p0
; VI: liveins: $vgpr0, $sgpr4_sgpr5
; VI: liveins: $vgpr0
; VI-NEXT: {{ $}}
; VI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; VI-NEXT: [[COPY1:%[0-9]+]]:_(p5) = COPY $vgpr0
; VI-NEXT: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
; VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 68
; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY2]], [[C]](s64)
; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
; VI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p5)
; VI-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
; VI-NEXT: [[C:%[0-9]+]]:_(p4) = G_CONSTANT i64 228
; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[C]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
; VI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p5)
; VI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
; VI-NEXT: [[C1:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1
; VI-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
; VI-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY1]](p5), [[C1]]
; VI-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](p5), [[C1]]
; VI-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C2]]
; VI-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](p0)
; GFX9-LABEL: name: test_addrspacecast_p5_to_p0
@@ -304,19 +301,16 @@ body: |
liveins: $vgpr0
; VI-LABEL: name: test_addrspacecast_p3_to_p0
; VI: liveins: $vgpr0, $sgpr4_sgpr5
; VI: liveins: $vgpr0
; VI-NEXT: {{ $}}
; VI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; VI-NEXT: [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr0
; VI-NEXT: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
; VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY2]], [[C]](s64)
; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 64, addrspace 4)
; VI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p3)
; VI-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
; VI-NEXT: [[C:%[0-9]+]]:_(p4) = G_CONSTANT i64 232
; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[C]](p4) :: (dereferenceable invariant load (s32), align 8, addrspace 4)
; VI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p3)
; VI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
; VI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
; VI-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
; VI-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY1]](p3), [[C1]]
; VI-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](p3), [[C1]]
; VI-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C2]]
; VI-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](p0)
; GFX9-LABEL: name: test_addrspacecast_p3_to_p0
@@ -539,24 +533,20 @@ body: |
liveins: $vgpr0_vgpr1
; VI-LABEL: name: test_addrspacecast_v2p3_to_v2p0
; VI: liveins: $vgpr0_vgpr1, $sgpr4_sgpr5
; VI: liveins: $vgpr0_vgpr1
; VI-NEXT: {{ $}}
; VI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; VI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1
; VI-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY1]](<2 x p3>)
; VI-NEXT: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
; VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY2]], [[C]](s64)
; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 64, addrspace 4)
; VI-NEXT: [[COPY:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1
; VI-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY]](<2 x p3>)
; VI-NEXT: [[C:%[0-9]+]]:_(p4) = G_CONSTANT i64 232
; VI-NEXT: [[COPY1:%[0-9]+]]:_(p4) = COPY [[C]](p4)
; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY1]](p4) :: (dereferenceable invariant load (s32), align 8, addrspace 4)
; VI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3)
; VI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
; VI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
; VI-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
; VI-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C1]]
; VI-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C2]]
; VI-NEXT: [[COPY3:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
; VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY3]], [[C]](s64)
; VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32), align 64, addrspace 4)
; VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[C]](p4) :: (dereferenceable invariant load (s32), align 8, addrspace 4)
; VI-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3)
; VI-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[LOAD1]](s32)
; VI-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C1]]
@@ -764,18 +754,13 @@ stack:
body: |
bb.0:
; VI-LABEL: name: test_addrspacecast_p5_fi_to_p0
; VI: liveins: $sgpr4_sgpr5
; VI-NEXT: {{ $}}
; VI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; VI-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
; VI-NEXT: [[COPY1:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
; VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 68
; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY1]], [[C]](s64)
; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
; VI: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
; VI-NEXT: [[C:%[0-9]+]]:_(p4) = G_CONSTANT i64 228
; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[C]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
; VI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5)
; VI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
; VI-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[MV]](p0)
; VI-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](p0)
; VI-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY [[MV]](p0)
; VI-NEXT: $vgpr0_vgpr1 = COPY [[COPY]](p0)
; GFX9-LABEL: name: test_addrspacecast_p5_fi_to_p0
; GFX9: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
; GFX9-NEXT: [[S_GETREG_B32_:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 30735
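With the queue pointer gone from the live-ins, the VI path now loads the flat-address aperture from a fixed constant-address-space location (the p4 constants 228 and 232 in the new checks) instead of queue_ptr + 68/64; the cast itself is unchanged. A sketch of the merge-and-select sequence the checks describe (the aperture value is whatever those loads return):

#include <cstdint>

// p5/p3 -> p0 cast as in the G_PTRTOINT/G_MERGE_VALUES/G_SELECT sequence:
// low half is the 32-bit pointer, high half the loaded aperture base, and
// the all-ones null pointer maps to flat null.
uint64_t castToFlat(uint32_t ptr, uint32_t apertureHi) {
  if (ptr == UINT32_MAX)
    return 0; // null in addrspace(3)/(5) -> flat null
  return (uint64_t(apertureHi) << 32) | ptr;
}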

View File

@@ -9,7 +9,8 @@
define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
; CI-LABEL: is_private_vgpr:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_load_dword s2, s[4:5], 0x32
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -18,9 +19,7 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_load_dword s0, s[4:5], 0x11
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1
; CI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v1
; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CI-NEXT: flat_store_dword v[0:1], v0
; CI-NEXT: s_endpgm
@@ -81,9 +80,9 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
; CI-LABEL: is_private_sgpr:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s0, s[4:5], 0x11
; CI-NEXT: s_load_dword s0, s[4:5], 0x32
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_lg_u32 s1, s0
; CI-NEXT: s_cbranch_scc1 .LBB1_2

View File

@@ -9,7 +9,8 @@
define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
; CI-LABEL: is_local_vgpr:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_load_dword s2, s[4:5], 0x33
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -18,9 +19,7 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_load_dword s0, s[4:5], 0x10
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1
; CI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v1
; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CI-NEXT: flat_store_dword v[0:1], v0
; CI-NEXT: s_endpgm
@@ -81,9 +80,9 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
; CI-LABEL: is_local_sgpr:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s0, s[4:5], 0x10
; CI-NEXT: s_load_dword s0, s[4:5], 0x33
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_lg_u32 s1, s0
; CI-NEXT: s_cbranch_scc1 .LBB1_2
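Both tests reduce to the same aperture compare: the pointer's high dword against a base that COV5 now fetches from the implicit kernel arguments (the s_load_dword at kernarg offsets 0x32/0x33 in the new checks) rather than from the queue pointer. A minimal model of the check:

#include <cstdint>

// llvm.amdgcn.is.private / is.shared: a flat pointer is in the scratch or
// LDS aperture iff its upper 32 bits equal that aperture's base.
bool inAperture(uint64_t flatPtr, uint32_t apertureHi) {
  return uint32_t(flatPtr >> 32) == apertureHi;
}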

View File

@@ -75,8 +75,8 @@ bb.2:
store volatile i32 0, ptr addrspace(1) undef
ret void
}
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
; DEFAULTSIZE: ; ScratchSize: 4112
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 16
; DEFAULTSIZE: ; ScratchSize: 16
; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
; ASSUME1024: ; ScratchSize: 1040
@@ -137,8 +137,8 @@ bb.1:
ret void
}
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
; DEFAULTSIZE: ; ScratchSize: 4160
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 64
; DEFAULTSIZE: ; ScratchSize: 64
; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
; ASSUME1024: ; ScratchSize: 1088
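The DEFAULTSIZE deltas are exactly 4096 bytes (4112 -> 16, 4160 -> 64), i.e. the fixed private-segment size drops back to the kernel's static frame. A sketch of the accounting this implies; the 4096-byte assumed budget for dynamically sized stack, and the idea that COV5 reports dynamic stack use to the runtime instead, are inferences from these checks, not quoted from the patch:

#include <cstdint>

// Inferred accounting: pre-COV5, a kernel with dynamically sized stack
// reserved an assumed 4096 bytes on top of its static frame; with COV5
// only the static frame is reported as the fixed size.
uint64_t privateSegmentFixedSize(uint64_t staticSize, bool dynamicStack,
                                 unsigned codeObjectVersion) {
  if (dynamicStack && codeObjectVersion < 5)
    return staticSize + 4096;
  return staticSize;
}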
@@ -265,3 +265,6 @@ bb.1:
declare i32 @llvm.amdgcn.workitem.id.x() #0
attributes #0 = { nounwind readnone speculatable }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ASSUME1024: {{.*}}
; DEFAULTSIZE: {{.*}}

View File

@@ -231,5 +231,5 @@ attributes #1 = { nounwind }
;.
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.

@ -458,7 +458,7 @@ define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
; AKF_HSA-NEXT: ret void
;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast
; ATTRIBUTOR_HSA-SAME: (i32 addrspace(3)* [[PTR:%.*]]) #[[ATTR8]] {
; ATTRIBUTOR_HSA-SAME: (i32 addrspace(3)* [[PTR:%.*]]) #[[ATTR12:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast i32 addrspace(3)* [[PTR]] to i32 addrspace(4)*
; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, i32 addrspace(4)* [[STOF]], align 4
; ATTRIBUTOR_HSA-NEXT: ret void
@ -477,7 +477,7 @@ define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #2 {
; AKF_HSA-NEXT: ret void
;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_gfx9
; ATTRIBUTOR_HSA-SAME: (i32 addrspace(3)* [[PTR:%.*]]) #[[ATTR12:[0-9]+]] {
; ATTRIBUTOR_HSA-SAME: (i32 addrspace(3)* [[PTR:%.*]]) #[[ATTR13:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast i32 addrspace(3)* [[PTR]] to i32 addrspace(4)*
; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, i32 addrspace(4)* [[STOF]], align 4
; ATTRIBUTOR_HSA-NEXT: ret void
@ -496,7 +496,7 @@ define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %p
; AKF_HSA-NEXT: ret void
;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_queue_ptr_gfx9
; ATTRIBUTOR_HSA-SAME: (i32 addrspace(3)* [[PTR:%.*]]) #[[ATTR13:[0-9]+]] {
; ATTRIBUTOR_HSA-SAME: (i32 addrspace(3)* [[PTR:%.*]]) #[[ATTR14:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast i32 addrspace(3)* [[PTR]] to i32 addrspace(4)*
; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, i32 addrspace(4)* [[STOF]], align 4
; ATTRIBUTOR_HSA-NEXT: call void @func_indirect_use_queue_ptr()
@ -515,7 +515,7 @@ define void @indirect_use_group_to_flat_addrspacecast() #1 {
; AKF_HSA-NEXT: ret void
;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast
; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] {
; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast(i32 addrspace(3)* null)
; ATTRIBUTOR_HSA-NEXT: ret void
;
@ -593,7 +593,7 @@ define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 {
; AKF_HSA-NEXT: ret void
;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR14:[0-9]+]] {
; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
; ATTRIBUTOR_HSA-NEXT: store volatile i8 addrspace(4)* [[IMPLICITARG_PTR]], i8 addrspace(4)* addrspace(1)* undef, align 8
; ATTRIBUTOR_HSA-NEXT: ret void
@ -611,7 +611,7 @@ define void @use_implicitarg_ptr() #1 {
; AKF_HSA-NEXT: ret void
;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR14]] {
; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
; ATTRIBUTOR_HSA-NEXT: store volatile i8 addrspace(4)* [[IMPLICITARG_PTR]], i8 addrspace(4)* addrspace(1)* undef, align 8
; ATTRIBUTOR_HSA-NEXT: ret void
@ -628,7 +628,7 @@ define void @func_indirect_use_implicitarg_ptr() #1 {
; AKF_HSA-NEXT: ret void
;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR14]] {
; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_implicitarg_ptr()
; ATTRIBUTOR_HSA-NEXT: ret void
;
@ -944,13 +944,13 @@ attributes #5 = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" }
; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind sanitize_address "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }

@ -431,7 +431,7 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %pt
; AKF_HSA-NEXT: ret void
;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast
; ATTRIBUTOR_HSA-SAME: (i32 addrspace(3)* [[PTR:%.*]]) #[[ATTR11]] {
; ATTRIBUTOR_HSA-SAME: (i32 addrspace(3)* [[PTR:%.*]]) #[[ATTR12:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast i32 addrspace(3)* [[PTR]] to i32*
; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, i32* [[STOF]], align 4
; ATTRIBUTOR_HSA-NEXT: ret void
@ -449,7 +449,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32 addrspace(5)* %
; AKF_HSA-NEXT: ret void
;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast
; ATTRIBUTOR_HSA-SAME: (i32 addrspace(5)* [[PTR:%.*]]) #[[ATTR11]] {
; ATTRIBUTOR_HSA-SAME: (i32 addrspace(5)* [[PTR:%.*]]) #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast i32 addrspace(5)* [[PTR]] to i32*
; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, i32* [[STOF]], align 4
; ATTRIBUTOR_HSA-NEXT: ret void
@ -541,7 +541,7 @@ define amdgpu_kernel void @use_is_shared(i8* %ptr) #1 {
; AKF_HSA-NEXT: ret void
;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_shared
; ATTRIBUTOR_HSA-SAME: (i8* [[PTR:%.*]]) #[[ATTR11]] {
; ATTRIBUTOR_HSA-SAME: (i8* [[PTR:%.*]]) #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(i8* [[PTR]])
; ATTRIBUTOR_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32
; ATTRIBUTOR_HSA-NEXT: store i32 [[EXT]], i32 addrspace(1)* undef, align 4
@ -562,7 +562,7 @@ define amdgpu_kernel void @use_is_private(i8* %ptr) #1 {
; AKF_HSA-NEXT: ret void
;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_private
; ATTRIBUTOR_HSA-SAME: (i8* [[PTR:%.*]]) #[[ATTR11]] {
; ATTRIBUTOR_HSA-SAME: (i8* [[PTR:%.*]]) #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(i8* [[PTR]])
; ATTRIBUTOR_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32
; ATTRIBUTOR_HSA-NEXT: store i32 [[EXT]], i32 addrspace(1)* undef, align 4
@ -657,5 +657,6 @@ attributes #1 = { nounwind }
; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.

@ -8,12 +8,12 @@
@alias = hidden alias void (), void ()* @aliasee_default
; ALL-LABEL: {{^}}kernel:
; GFX908: .amdhsa_next_free_vgpr 41
; GFX908: .amdhsa_next_free_vgpr 32
; GFX908-NEXT: .amdhsa_next_free_sgpr 33
; GFX90A: .amdhsa_next_free_vgpr 71
; GFX90A: .amdhsa_next_free_vgpr 59
; GFX90A-NEXT: .amdhsa_next_free_sgpr 33
; GFX90A-NEXT: .amdhsa_accum_offset 44
; GFX90A-NEXT: .amdhsa_accum_offset 32
define amdgpu_kernel void @kernel() #0 {
bb:
call void @alias() #2

@ -9,7 +9,7 @@
; The parent kernel has a higher VGPR usage than the possible callees.
; CHECK-LABEL: {{^}}kernel1:
; CHECK: .amdhsa_next_free_vgpr 42
; CHECK: .amdhsa_next_free_vgpr 41
; CHECK-NEXT: .amdhsa_next_free_sgpr 33
define amdgpu_kernel void @kernel1() #0 {
bb:

@ -5,20 +5,19 @@
define amdgpu_kernel void @call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 {
; GCN-LABEL: call_memory_arg_load:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s6, s[6:7], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GCN-NEXT: s_add_u32 s0, s0, s11
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: ds_read_b32 v0, v0
; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_getpc_b64 s[8:9]
; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_endpgm
%vgpr = load volatile i32, i32 addrspace(3)* %ptr
call void @func(i32 %vgpr)
@ -29,21 +28,20 @@ define amdgpu_kernel void @call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0
define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_memory_no_dep:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GCN-NEXT: s_add_u32 s0, s0, s11
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_store_dword v0, v0, s[6:7]
; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-NEXT: global_store_dword v0, v0, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_getpc_b64 s[8:9]
; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: s_endpgm
store i32 0, i32 addrspace(1)* %ptr
call void @func(i32 0)
@ -54,19 +52,18 @@ define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 {
define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_no_wait_after_call:
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GCN-NEXT: s_add_u32 s0, s0, s11
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_getpc_b64 s[8:9]
; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: global_store_dword v40, v40, s[34:35]
; GCN-NEXT: s_endpgm
call void @func(i32 0)
@ -77,19 +74,18 @@ define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32)
define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_no_wait_after_call_return_val:
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GCN-NEXT: s_add_u32 s0, s0, s11
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_getpc_b64 s[8:9]
; GCN-NEXT: s_add_u32 s8, s8, func.return@rel32@lo+4
; GCN-NEXT: s_addc_u32 s9, s9, func.return@rel32@hi+12
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func.return@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func.return@rel32@hi+12
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: global_store_dword v40, v0, s[34:35]
; GCN-NEXT: s_endpgm
%rv = call i32 @func.return(i32 0)
@ -101,19 +97,18 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)*
define amdgpu_kernel void @call_got_load(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_got_load:
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GCN-NEXT: s_add_u32 s0, s0, s11
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, got.func@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, got.func@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_endpgm
call void @got.func(i32 0)
ret void

@ -1,5 +1,5 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=4 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; GCN-LABEL: {{^}}use_dispatch_ptr:
; GCN: s_load_dword s{{[0-9]+}}, s[4:5]

@ -68,80 +68,76 @@ entry:
define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_add_i32 s10, s10, s15
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: s_add_u32 s0, s0, s15
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: s_mov_b32 s12, s14
; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX803-NEXT: s_mov_b32 s32, 0
; GFX803-NEXT: s_getpc_b64 s[18:19]
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_getpc_b64 s[16:17]
; GFX803-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX803-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_kern_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GFX900-NEXT: s_add_u32 s0, s0, s15
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX900-NEXT: s_mov_b32 s32, 0
; GFX900-NEXT: s_getpc_b64 s[18:19]
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_getpc_b64 s[16:17]
; GFX900-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_kern_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_add_u32 s10, s10, s15
; GFX1010-NEXT: s_mov_b32 s32, 0
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: s_addc_u32 s11, s11, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_add_u32 s0, s0, s15
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: s_mov_b32 s12, s14
; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_getpc_b64 s[16:17]
; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1010-NEXT: s_endpgm
;
; GFX1100-LABEL: test_kern_call:
; GFX1100: ; %bb.0: ; %entry
; GFX1100-NEXT: v_mov_b32_e32 v31, v0
; GFX1100-NEXT: s_mov_b32 s12, s13
; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX1100-NEXT: s_mov_b32 s13, s14
; GFX1100-NEXT: s_mov_b32 s14, s15
; GFX1100-NEXT: s_mov_b32 s32, 0
; GFX1100-NEXT: s_getpc_b64 s[16:17]
; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_getpc_b64 s[6:7]
; GFX1100-NEXT: s_add_u32 s6, s6, ex@rel32@lo+4
; GFX1100-NEXT: s_addc_u32 s7, s7, ex@rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX1100-NEXT: s_endpgm
entry:
@ -152,72 +148,69 @@ entry:
define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_add_i32 s10, s10, s15
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: s_add_u32 s0, s0, s15
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: s_mov_b32 s12, s14
; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX803-NEXT: v_mov_b32_e32 v3, 0
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX803-NEXT: s_movk_i32 s32, 0x400
; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_getpc_b64 s[18:19]
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_getpc_b64 s[16:17]
; GFX803-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX803-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_kern_stack_and_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GFX900-NEXT: s_add_u32 s0, s0, s15
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX900-NEXT: s_movk_i32 s32, 0x400
; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_getpc_b64 s[18:19]
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_getpc_b64 s[16:17]
; GFX900-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_kern_stack_and_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_add_u32 s10, s10, s15
; GFX1010-NEXT: s_movk_i32 s32, 0x200
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: s_addc_u32 s11, s11, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: v_mov_b32_e32 v3, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_add_u32 s0, s0, s15
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b32 s12, s14
; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_getpc_b64 s[16:17]
; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1010-NEXT: s_endpgm
;
; GFX1100-LABEL: test_kern_stack_and_call:
@ -225,19 +218,18 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX1100-NEXT: v_mov_b32_e32 v1, 0
; GFX1100-NEXT: v_mov_b32_e32 v31, v0
; GFX1100-NEXT: s_mov_b32 s12, s13
; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX1100-NEXT: s_mov_b32 s13, s14
; GFX1100-NEXT: s_mov_b32 s14, s15
; GFX1100-NEXT: s_mov_b32 s32, 16
; GFX1100-NEXT: scratch_store_b32 off, v1, off offset:4 dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT: s_getpc_b64 s[16:17]
; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_getpc_b64 s[6:7]
; GFX1100-NEXT: s_add_u32 s6, s6, ex@rel32@lo+4
; GFX1100-NEXT: s_addc_u32 s7, s7, ex@rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
@ -321,84 +313,80 @@ entry:
define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_add_i32 s10, s10, s15
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: s_add_u32 s0, s0, s15
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: s_mov_b32 s12, s14
; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX803-NEXT: s_mov_b32 s32, 0
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: s_getpc_b64 s[18:19]
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_getpc_b64 s[16:17]
; GFX803-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX803-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GFX900-NEXT: s_add_u32 s0, s0, s15
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX900-NEXT: s_mov_b32 s32, 0
; GFX900-NEXT: s_mov_b32 s33, 0
; GFX900-NEXT: s_getpc_b64 s[18:19]
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_getpc_b64 s[16:17]
; GFX900-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_add_u32 s10, s10, s15
; GFX1010-NEXT: s_mov_b32 s32, 0
; GFX1010-NEXT: s_mov_b32 s33, 0
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: s_addc_u32 s11, s11, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_add_u32 s0, s0, s15
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: s_mov_b32 s12, s14
; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_getpc_b64 s[16:17]
; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1010-NEXT: s_endpgm
;
; GFX1100-LABEL: test_force_fp_kern_call:
; GFX1100: ; %bb.0: ; %entry
; GFX1100-NEXT: v_mov_b32_e32 v31, v0
; GFX1100-NEXT: s_mov_b32 s12, s13
; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX1100-NEXT: s_mov_b32 s13, s14
; GFX1100-NEXT: s_mov_b32 s14, s15
; GFX1100-NEXT: s_mov_b32 s32, 0
; GFX1100-NEXT: s_mov_b32 s33, 0
; GFX1100-NEXT: s_getpc_b64 s[16:17]
; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_getpc_b64 s[6:7]
; GFX1100-NEXT: s_add_u32 s6, s6, ex@rel32@lo+4
; GFX1100-NEXT: s_addc_u32 s7, s7, ex@rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX1100-NEXT: s_endpgm
; GFX1010-NEXT s_add_u32 s12, s12, s17
; GFX1010-NEXT s_mov_b32 s32, 0
@ -427,75 +415,72 @@ entry:
define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_add_i32 s10, s10, s15
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: s_add_u32 s0, s0, s15
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: s_mov_b32 s12, s14
; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX803-NEXT: v_mov_b32_e32 v3, 0
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX803-NEXT: s_movk_i32 s32, 0x400
; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_getpc_b64 s[18:19]
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_getpc_b64 s[16:17]
; GFX803-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX803-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack_and_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GFX900-NEXT: s_add_u32 s0, s0, s15
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_mov_b32 s33, 0
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX900-NEXT: s_movk_i32 s32, 0x400
; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_getpc_b64 s[18:19]
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_getpc_b64 s[16:17]
; GFX900-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_add_u32 s10, s10, s15
; GFX1010-NEXT: s_movk_i32 s32, 0x200
; GFX1010-NEXT: s_mov_b32 s33, 0
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: s_addc_u32 s11, s11, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: v_mov_b32_e32 v3, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_add_u32 s0, s0, s15
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b32 s12, s14
; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_getpc_b64 s[16:17]
; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1010-NEXT: s_endpgm
;
; GFX1100-LABEL: test_force_fp_kern_stack_and_call:
@ -504,19 +489,18 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX1100-NEXT: v_mov_b32_e32 v31, v0
; GFX1100-NEXT: s_mov_b32 s33, 0
; GFX1100-NEXT: s_mov_b32 s12, s13
; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX1100-NEXT: s_mov_b32 s13, s14
; GFX1100-NEXT: s_mov_b32 s14, s15
; GFX1100-NEXT: s_mov_b32 s32, 16
; GFX1100-NEXT: scratch_store_b32 off, v1, s33 offset:4 dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT: s_getpc_b64 s[16:17]
; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_getpc_b64 s[6:7]
; GFX1100-NEXT: s_add_u32 s6, s6, ex@rel32@lo+4
; GFX1100-NEXT: s_addc_u32 s7, s7, ex@rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
entry:

@ -168,30 +168,28 @@ bb1:
define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
; GCN-LABEL: v3i16_registers:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GCN-NEXT: s_load_dword s12, s[8:9], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GCN-NEXT: s_mov_b64 s[10:11], s[8:9]
; GCN-NEXT: s_load_dword s8, s[6:7], 0x0
; GCN-NEXT: s_add_u32 s0, s0, s15
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s12, 0
; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
; GCN-NEXT: s_and_b64 vcc, exec, s[12:13]
; GCN-NEXT: s_bitcmp1_b32 s8, 0
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_and_b64 vcc, exec, s[8:9]
; GCN-NEXT: s_cbranch_vccnz .LBB4_2
; GCN-NEXT: ; %bb.1: ; %if.else
; GCN-NEXT: s_add_u32 s8, s8, 8
; GCN-NEXT: s_add_u32 s8, s6, 8
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: s_addc_u32 s9, s9, 0
; GCN-NEXT: s_addc_u32 s9, s7, 0
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s14, s16
; GCN-NEXT: s_getpc_b64 s[18:19]
; GCN-NEXT: s_add_u32 s18, s18, func_v3i16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s19, s19, func_v3i16@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, func_v3i16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, func_v3i16@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: s_branch .LBB4_3
; GCN-NEXT: .LBB4_2:
; GCN-NEXT: s_mov_b32 s4, 0
@ -221,30 +219,28 @@ if.end: ; preds = %if.else, %if.then
define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
; GCN-LABEL: v3f16_registers:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GCN-NEXT: s_load_dword s12, s[8:9], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GCN-NEXT: s_mov_b64 s[10:11], s[8:9]
; GCN-NEXT: s_load_dword s8, s[6:7], 0x0
; GCN-NEXT: s_add_u32 s0, s0, s15
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s12, 0
; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
; GCN-NEXT: s_and_b64 vcc, exec, s[12:13]
; GCN-NEXT: s_bitcmp1_b32 s8, 0
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_and_b64 vcc, exec, s[8:9]
; GCN-NEXT: s_cbranch_vccnz .LBB5_2
; GCN-NEXT: ; %bb.1: ; %if.else
; GCN-NEXT: s_add_u32 s8, s8, 8
; GCN-NEXT: s_add_u32 s8, s6, 8
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: s_addc_u32 s9, s9, 0
; GCN-NEXT: s_addc_u32 s9, s7, 0
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s14, s16
; GCN-NEXT: s_getpc_b64 s[18:19]
; GCN-NEXT: s_add_u32 s18, s18, func_v3f16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s19, s19, func_v3f16@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, func_v3f16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, func_v3f16@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: s_branch .LBB5_3
; GCN-NEXT: .LBB5_2:
; GCN-NEXT: s_mov_b32 s4, 0

@ -992,7 +992,7 @@ define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 a
;
; GFX9-LABEL: load_i8_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3]
@ -1073,7 +1073,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias
;
; GFX9-LABEL: load_v2i8_to_v2f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -1162,7 +1162,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias
;
; GFX9-LABEL: load_v3i8_to_v3f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -1254,7 +1254,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias
;
; GFX9-LABEL: load_v4i8_to_v4f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -1378,7 +1378,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
;
; GFX9-LABEL: load_v4i8_to_v4f32_unaligned:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -1542,14 +1542,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
;
; GFX9-LABEL: load_v4i8_to_v4f32_2_uses:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_mov_b32_e32 v6, 9
; GFX9-NEXT: s_movk_i32 s4, 0x900
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v0, s[0:1]
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_movk_i32 s4, 0x900
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v4
; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
@ -1739,7 +1739,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
;
; GFX9-LABEL: load_v7i8_to_v7f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v10, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -1879,7 +1879,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias
;
; GFX9-LABEL: load_v8i8_to_v8f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -1979,7 +1979,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias
;
; GFX9-LABEL: i8_zext_inreg_i32_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -2063,7 +2063,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias
;
; GFX9-LABEL: i8_zext_inreg_hi1_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -2143,7 +2143,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out,
;
; GFX9-LABEL: i8_zext_i32_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3]
@ -2256,7 +2256,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
;
; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -2355,7 +2355,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out
;
; GFX9-LABEL: extract_byte0_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -2435,7 +2435,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out
;
; GFX9-LABEL: extract_byte1_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -2516,7 +2516,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out
;
; GFX9-LABEL: extract_byte2_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -2597,7 +2597,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out
;
; GFX9-LABEL: extract_byte3_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -2685,7 +2685,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float a
;
; GFX9-LABEL: cvt_ubyte0_or_multiuse:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)

@ -20,16 +20,16 @@ define void @main(float %arg) {
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
; CHECK-NEXT: %20:vgpr_32 = contract reassoc nofpexcept V_FMAC_F32_e64 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: %1:vgpr_32 = contract reassoc nofpexcept V_FMAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, %20, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: %2:vgpr_32 = contract reassoc nofpexcept V_ADD_F32_e64 0, %1, 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_FMAC_F32_e64 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_FMAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[V_FMAC_F32_e64_]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_ADD_F32_e64 0, [[V_FMAC_F32_e64_1]], 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2.bb11:
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %1, %bb.1
; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %2, %bb.1
; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, [[V_FMAC_F32_e64_1]], %bb.1
; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, [[V_ADD_F32_e64_]], %bb.1
; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[S_MOV_B32_1]], %bb.0, [[S_MOV_B32_2]], %bb.1
; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[PHI2]], implicit $exec
; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1

@ -1337,33 +1337,31 @@ define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspa
; CI-NEXT: s_getpc_b64 s[40:41]
; CI-NEXT: s_mov_b32 s40, s0
; CI-NEXT: s_load_dwordx4 s[40:43], s[40:41], 0x0
; CI-NEXT: s_mov_b32 s14, s10
; CI-NEXT: s_mov_b32 s12, s8
; CI-NEXT: s_mov_b32 s13, s9
; CI-NEXT: s_mov_b64 s[10:11], s[4:5]
; CI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x0
; CI-NEXT: s_load_dword s4, s[2:3], 0x2
; CI-NEXT: s_mov_b32 s14, s8
; CI-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s40, s40, s11
; CI-NEXT: s_mov_b64 s[10:11], s[6:7]
; CI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0
; CI-NEXT: s_load_dword s6, s[4:5], 0x2
; CI-NEXT: s_add_u32 s40, s40, s9
; CI-NEXT: s_addc_u32 s41, s41, 0
; CI-NEXT: s_add_u32 s8, s4, 12
; CI-NEXT: s_addc_u32 s9, s5, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, void_func_void@gotpcrel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, void_func_void@gotpcrel32@hi+12
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v40, vcc, s6, v3
; CI-NEXT: s_add_u32 s8, s2, 12
; CI-NEXT: s_addc_u32 s9, s3, 0
; CI-NEXT: s_getpc_b64 s[2:3]
; CI-NEXT: s_add_u32 s2, s2, void_func_void@gotpcrel32@lo+4
; CI-NEXT: s_addc_u32 s3, s3, void_func_void@gotpcrel32@hi+12
; CI-NEXT: v_add_i32_e32 v40, vcc, s4, v3
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; CI-NEXT: ds_read_b32 v41, v40
; CI-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; CI-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; CI-NEXT: v_or_b32_e32 v0, v0, v1
; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_mov_b64 s[0:1], s[40:41]
; CI-NEXT: v_or_b32_e32 v31, v0, v2
; CI-NEXT: s_mov_b32 s12, s6
; CI-NEXT: s_mov_b32 s13, s7
; CI-NEXT: s_mov_b64 s[2:3], s[42:43]
; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_mov_b32 s39, 0xf000
@ -1381,30 +1379,28 @@ define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspa
; GFX9-NEXT: s_getpc_b64 s[36:37]
; GFX9-NEXT: s_mov_b32 s36, s0
; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0
; GFX9-NEXT: s_mov_b32 s14, s10
; GFX9-NEXT: s_mov_b32 s12, s8
; GFX9-NEXT: s_mov_b32 s13, s9
; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX9-NEXT: s_load_dword s4, s[2:3], 0x8
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x0
; GFX9-NEXT: s_mov_b32 s14, s8
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s36, s36, s11
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
; GFX9-NEXT: s_add_u32 s8, s4, 12
; GFX9-NEXT: s_addc_u32 s9, s5, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, void_func_void@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, void_func_void@gotpcrel32@hi+12
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s6
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX9-NEXT: s_add_u32 s8, s2, 12
; GFX9-NEXT: s_addc_u32 s9, s3, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, void_func_void@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, void_func_void@gotpcrel32@hi+12
; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s4
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX9-NEXT: ds_read_b32 v42, v41
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b32 s12, s6
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: v_mov_b32_e32 v40, 0


@ -49,19 +49,19 @@ define weak_odr void @test(i32 %0) !dbg !34 {
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7]
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[46:47]
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b64 s[6:7], s[34:35]
; CHECK-NEXT: s_mov_b64 s[8:9], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43


@ -13,7 +13,7 @@
; NONE: OS/ABI: SystemV (0x0)
; HSA: OS/ABI: AMDGPU_HSA (0x40)
; HSA: ABIVersion: 2
; HSA: ABIVersion: 3
; PAL: OS/ABI: AMDGPU_PAL (0x41)
; PAL: ABIVersion: 0
; MESA3D: OS/ABI: AMDGPU_MESA3D (0x42)


@ -64,45 +64,43 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
define amdgpu_kernel void @kernel_calls_no_stack() {
; FLAT_SCR_OPT-LABEL: kernel_calls_no_stack:
; FLAT_SCR_OPT: ; %bb.0:
; FLAT_SCR_OPT-NEXT: s_add_u32 s8, s8, s13
; FLAT_SCR_OPT-NEXT: s_add_u32 s6, s6, s11
; FLAT_SCR_OPT-NEXT: s_mov_b32 s32, 0
; FLAT_SCR_OPT-NEXT: s_addc_u32 s9, s9, 0
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; FLAT_SCR_OPT-NEXT: s_mov_b64 s[8:9], s[4:5]
; FLAT_SCR_OPT-NEXT: s_addc_u32 s7, s7, 0
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
; FLAT_SCR_OPT-NEXT: s_mov_b32 s14, s10
; FLAT_SCR_OPT-NEXT: s_mov_b64 s[10:11], s[4:5]
; FLAT_SCR_OPT-NEXT: s_getpc_b64 s[4:5]
; FLAT_SCR_OPT-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; FLAT_SCR_OPT-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; FLAT_SCR_OPT-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; FLAT_SCR_OPT-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; FLAT_SCR_OPT-NEXT: s_mov_b32 s14, s12
; FLAT_SCR_OPT-NEXT: s_mov_b32 s13, s11
; FLAT_SCR_OPT-NEXT: s_mov_b32 s12, s10
; FLAT_SCR_OPT-NEXT: s_mov_b64 s[10:11], s[6:7]
; FLAT_SCR_OPT-NEXT: v_or3_b32 v31, v0, v1, v2
; FLAT_SCR_OPT-NEXT: s_mov_b32 s13, s9
; FLAT_SCR_OPT-NEXT: s_mov_b32 s12, s8
; FLAT_SCR_OPT-NEXT: s_mov_b64 s[4:5], s[0:1]
; FLAT_SCR_OPT-NEXT: s_mov_b64 s[6:7], s[2:3]
; FLAT_SCR_OPT-NEXT: s_mov_b64 s[8:9], s[2:3]
; FLAT_SCR_OPT-NEXT: v_or3_b32 v31, v0, v1, v2
; FLAT_SCR_OPT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT_SCR_OPT-NEXT: s_swappc_b64 s[30:31], s[16:17]
; FLAT_SCR_OPT-NEXT: s_swappc_b64 s[30:31], s[6:7]
; FLAT_SCR_OPT-NEXT: s_endpgm
;
; FLAT_SCR_ARCH-LABEL: kernel_calls_no_stack:
; FLAT_SCR_ARCH: ; %bb.0:
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s13, s9
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s12, s8
; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[8:9], s[4:5]
; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[10:11], s[4:5]
; FLAT_SCR_ARCH-NEXT: s_getpc_b64 s[4:5]
; FLAT_SCR_ARCH-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; FLAT_SCR_ARCH-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; FLAT_SCR_ARCH-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; FLAT_SCR_ARCH-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; FLAT_SCR_ARCH-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s14, s10
; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[10:11], s[6:7]
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s14, s8
; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[4:5], s[0:1]
; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[6:7], s[2:3]
; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[8:9], s[2:3]
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s12, s6
; FLAT_SCR_ARCH-NEXT: v_or3_b32 v31, v0, v1, v2
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s13, s7
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s32, 0
; FLAT_SCR_ARCH-NEXT: s_waitcnt lgkmcnt(0)
; FLAT_SCR_ARCH-NEXT: s_swappc_b64 s[30:31], s[16:17]


@ -1,4 +1,4 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=SI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn --amdhsa-code-object-version=4 -verify-machineinstrs < %s | FileCheck --check-prefixes=SI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=VI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600,FUNC %s


@ -36,6 +36,7 @@
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
; GCN-NEXT: .amdhsa_enable_private_segment 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
@ -64,6 +65,7 @@ define amdgpu_kernel void @minimal_kernel_inputs() {
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
; GCN-NEXT: .amdhsa_enable_private_segment 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
@ -81,7 +83,7 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() {
}
; GCN-LABEL: {{^}}queue_ptr:
; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1]
; GCN: global_load_u8 v{{[0-9]+}},
; WORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s15
; NOWORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s2
@ -91,11 +93,12 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() {
; WORKAROUND: .amdhsa_user_sgpr_count 15
; NOWORKAROUND: .amdhsa_user_sgpr_count 2
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 1
; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
; GCN-NEXT: .amdhsa_enable_private_segment 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
@ -117,16 +120,16 @@ define amdgpu_kernel void @queue_ptr() {
; WORKAROUND: v_mov_b32_e32 [[V_Y:v[0-9]+]], s14
; WORKAROUND: v_mov_b32_e32 [[V_Z:v[0-9]+]], s15
; NOWORKAROUND: v_mov_b32_e32 [[V_X:v[0-9]+]], s8
; NOWORKAROUND: v_mov_b32_e32 [[V_Y:v[0-9]+]], s9
; NOWORKAROUND: v_mov_b32_e32 [[V_Z:v[0-9]+]], s10
; NOWORKAROUND: v_mov_b32_e32 [[V_X:v[0-9]+]], s6
; NOWORKAROUND: v_mov_b32_e32 [[V_Y:v[0-9]+]], s7
; NOWORKAROUND: v_mov_b32_e32 [[V_Z:v[0-9]+]], s8
; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1]
; GCN: global_load_u8 v{{[0-9]+}},
; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[2:3]
; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[4:5]
; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_LO:[0-9]+]], s6
; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_HI:[0-9]+]], s7
; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_LO:[0-9]+]], s4
; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_HI:[0-9]+]], s5
; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V_X]], off
; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V_Y]], off
@ -135,13 +138,14 @@ define amdgpu_kernel void @queue_ptr() {
; GCN: .amdhsa_kernel all_inputs
; WORKAROUND: .amdhsa_user_sgpr_count 13
; NOWORKAROUND: .amdhsa_user_sgpr_count 8
; NOWORKAROUND: .amdhsa_user_sgpr_count 6
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 1
; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
; GCN-NEXT: .amdhsa_enable_private_segment 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
@ -149,7 +153,7 @@ define amdgpu_kernel void @queue_ptr() {
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 13
; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 8
; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 6
define amdgpu_kernel void @all_inputs() {
%alloca = alloca i32, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca


@ -9,48 +9,45 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s32, 0x180000
; CHECK-NEXT: s_mov_b32 s33, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; CHECK-NEXT: s_add_u32 s0, s0, s15
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_writelane_b32 v40, s16, 0
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: v_readlane_b32 s14, v40, 0
; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9]
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: v_mov_b32_e32 v3, v2
; CHECK-NEXT: v_mov_b32_e32 v2, v1
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: s_load_dword s8, s[16:17], 0x0
; CHECK-NEXT: s_load_dword s8, s[6:7], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_writelane_b32 v40, s8, 1
; CHECK-NEXT: v_writelane_b32 v40, s8, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def vgpr10
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_add_i32 s8, s33, 0x100100
; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s8 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 s[18:19], 8
; CHECK-NEXT: s_mov_b32 s8, s16
; CHECK-NEXT: s_mov_b32 s9, s17
; CHECK-NEXT: s_mov_b32 s16, s18
; CHECK-NEXT: s_mov_b32 s15, s19
; CHECK-NEXT: s_add_u32 s8, s8, s16
; CHECK-NEXT: s_addc_u32 s15, s9, s15
; CHECK-NEXT: s_mov_b64 s[16:17], 8
; CHECK-NEXT: s_mov_b32 s8, s6
; CHECK-NEXT: s_mov_b32 s6, s7
; CHECK-NEXT: s_mov_b32 s9, s16
; CHECK-NEXT: s_mov_b32 s7, s17
; CHECK-NEXT: s_add_u32 s8, s8, s9
; CHECK-NEXT: s_addc_u32 s6, s6, s7
; CHECK-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; CHECK-NEXT: s_mov_b32 s9, s15
; CHECK-NEXT: s_mov_b32 s9, s6
; CHECK-NEXT: v_mov_b32_e32 v0, 0x2000
; CHECK-NEXT: ; implicit-def: $sgpr15
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, device_func@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, device_func@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; CHECK-NEXT: ; implicit-def: $sgpr6
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, device_func@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, device_func@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_mov_b32 s15, 20
; CHECK-NEXT: v_lshlrev_b32_e64 v3, s15, v3
; CHECK-NEXT: s_mov_b32 s15, 10
; CHECK-NEXT: v_lshlrev_b32_e64 v2, s15, v2
; CHECK-NEXT: s_mov_b32 s6, 20
; CHECK-NEXT: v_lshlrev_b32_e64 v3, s6, v3
; CHECK-NEXT: s_mov_b32 s6, 10
; CHECK-NEXT: v_lshlrev_b32_e64 v2, s6, v2
; CHECK-NEXT: v_or3_b32 v31, v1, v2, v3
; CHECK-NEXT: ; implicit-def: $sgpr6_sgpr7
; CHECK-NEXT: ; implicit-def: $sgpr15
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
@ -58,7 +55,7 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_add_i32 s4, s33, 0x100100
; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: v_readlane_b32 s4, v40, 1
; CHECK-NEXT: v_readlane_b32 s4, v40, 0
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: s_cmp_eq_u32 s4, s5
; CHECK-NEXT: v_mov_b32_e32 v0, 0x4000

View File

@ -59,19 +59,20 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) #1 {
define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
; CHECK-LABEL: module_1_kernel_normal_extern_normal:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s8, s8, s11
; CHECK-NEXT: s_add_u32 s6, s6, s9
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT: s_add_u32 s0, s0, s11
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
; CHECK-NEXT: s_add_u32 s0, s0, s9
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_getpc_b64 s[8:9]
; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0
; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0
; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11]
; CHECK-NEXT: s_lshl_b32 s4, s12, 2
@ -117,19 +118,20 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) #1
define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
; CHECK-LABEL: module_1_kernel_overalign_extern_normal:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s8, s8, s11
; CHECK-NEXT: s_add_u32 s6, s6, s9
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT: s_add_u32 s0, s0, s11
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
; CHECK-NEXT: s_add_u32 s0, s0, s9
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_getpc_b64 s[8:9]
; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0
; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0
; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11]
; CHECK-NEXT: s_lshl_b32 s4, s12, 2
@ -175,19 +177,20 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) #1
define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
; CHECK-LABEL: module_1_kernel_normal_extern_overalign:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s8, s8, s11
; CHECK-NEXT: s_add_u32 s6, s6, s9
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT: s_add_u32 s0, s0, s11
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
; CHECK-NEXT: s_add_u32 s0, s0, s9
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_getpc_b64 s[8:9]
; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0
; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0
; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11]
; CHECK-NEXT: s_lshl_b32 s4, s12, 2
@ -233,19 +236,20 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx)
define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) {
; CHECK-LABEL: module_1_kernel_overalign_extern_overalign:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s8, s8, s11
; CHECK-NEXT: s_add_u32 s6, s6, s9
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT: s_add_u32 s0, s0, s11
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
; CHECK-NEXT: s_add_u32 s0, s0, s9
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_getpc_b64 s[8:9]
; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0
; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0
; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11]
; CHECK-NEXT: s_lshl_b32 s4, s12, 2


@ -14,10 +14,11 @@ define void @func_use_lds_global() {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: ds_write_b32 v0, v0
; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-NEXT: s_trap 2
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_trap 2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: func_use_lds_global:
@ -37,7 +38,9 @@ define void @func_use_lds_global_constexpr_cast() {
; GFX8-LABEL: func_use_lds_global_constexpr_cast:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_trap 2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;


@ -1,4 +1,4 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs --amdhsa-code-object-version=4 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; GCN-LABEL: {{^}}is_private_vgpr:


@ -1,4 +1,4 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs --amdhsa-code-object-version=4 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; GCN-LABEL: {{^}}is_local_vgpr:


@ -17,7 +17,7 @@ define amdgpu_kernel void @kern_noargs() {
define amdgpu_kernel void @kern_i8(i8 %arg) #0 {
; HSA-LABEL: @kern_i8(
; HSA-NEXT: [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@ -26,7 +26,7 @@ define amdgpu_kernel void @kern_i8(i8 %arg) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_i8(
; MESA-NEXT: [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@ -40,7 +40,7 @@ define amdgpu_kernel void @kern_i8(i8 %arg) #0 {
define amdgpu_kernel void @kern_i16(i16 %arg) #0 {
; HSA-LABEL: @kern_i16(
; HSA-NEXT: [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@ -49,7 +49,7 @@ define amdgpu_kernel void @kern_i16(i16 %arg) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_i16(
; MESA-NEXT: [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@ -63,7 +63,7 @@ define amdgpu_kernel void @kern_i16(i16 %arg) #0 {
define amdgpu_kernel void @kern_f16(half %arg) #0 {
; HSA-LABEL: @kern_f16(
; HSA-NEXT: [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@ -73,7 +73,7 @@ define amdgpu_kernel void @kern_f16(half %arg) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_f16(
; MESA-NEXT: [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@ -88,7 +88,7 @@ define amdgpu_kernel void @kern_f16(half %arg) #0 {
define amdgpu_kernel void @kern_zeroext_i8(i8 zeroext %arg) #0 {
; HSA-LABEL: @kern_zeroext_i8(
; HSA-NEXT: [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@ -97,7 +97,7 @@ define amdgpu_kernel void @kern_zeroext_i8(i8 zeroext %arg) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_zeroext_i8(
; MESA-NEXT: [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@ -111,7 +111,7 @@ define amdgpu_kernel void @kern_zeroext_i8(i8 zeroext %arg) #0 {
define amdgpu_kernel void @kern_zeroext_i16(i16 zeroext %arg) #0 {
; HSA-LABEL: @kern_zeroext_i16(
; HSA-NEXT: [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@ -120,7 +120,7 @@ define amdgpu_kernel void @kern_zeroext_i16(i16 zeroext %arg) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_zeroext_i16(
; MESA-NEXT: [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@ -134,7 +134,7 @@ define amdgpu_kernel void @kern_zeroext_i16(i16 zeroext %arg) #0 {
define amdgpu_kernel void @kern_signext_i8(i8 signext %arg) #0 {
; HSA-LABEL: @kern_signext_i8(
; HSA-NEXT: [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@ -143,7 +143,7 @@ define amdgpu_kernel void @kern_signext_i8(i8 signext %arg) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_signext_i8(
; MESA-NEXT: [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@ -157,7 +157,7 @@ define amdgpu_kernel void @kern_signext_i8(i8 signext %arg) #0 {
define amdgpu_kernel void @kern_signext_i16(i16 signext %arg) #0 {
; HSA-LABEL: @kern_signext_i16(
; HSA-NEXT: [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@ -166,7 +166,7 @@ define amdgpu_kernel void @kern_signext_i16(i16 signext %arg) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_signext_i16(
; MESA-NEXT: [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@ -180,7 +180,7 @@ define amdgpu_kernel void @kern_signext_i16(i16 signext %arg) #0 {
define amdgpu_kernel void @kern_i8_i8(i8 %arg0, i8 %arg1) {
; HSA-LABEL: @kern_i8_i8(
; HSA-NEXT: [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@ -195,7 +195,7 @@ define amdgpu_kernel void @kern_i8_i8(i8 %arg0, i8 %arg1) {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_i8_i8(
; MESA-NEXT: [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@ -216,7 +216,7 @@ define amdgpu_kernel void @kern_i8_i8(i8 %arg0, i8 %arg1) {
define amdgpu_kernel void @kern_v3i8(<3 x i8> %arg) {
; HSA-LABEL: @kern_v3i8(
; HSA-NEXT: [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@ -226,7 +226,7 @@ define amdgpu_kernel void @kern_v3i8(<3 x i8> %arg) {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_v3i8(
; MESA-NEXT: [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@ -241,7 +241,7 @@ define amdgpu_kernel void @kern_v3i8(<3 x i8> %arg) {
define amdgpu_kernel void @kern_i24(i24 %arg0) {
; HSA-LABEL: @kern_i24(
; HSA-NEXT: [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@ -250,7 +250,7 @@ define amdgpu_kernel void @kern_i24(i24 %arg0) {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_i24(
; MESA-NEXT: [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@ -264,7 +264,7 @@ define amdgpu_kernel void @kern_i24(i24 %arg0) {
define amdgpu_kernel void @kern_i32(i32 %arg0) {
; HSA-LABEL: @kern_i32(
; HSA-NEXT: [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@ -272,7 +272,7 @@ define amdgpu_kernel void @kern_i32(i32 %arg0) {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_i32(
; MESA-NEXT: [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@ -285,7 +285,7 @@ define amdgpu_kernel void @kern_i32(i32 %arg0) {
define amdgpu_kernel void @kern_f32(float %arg0) {
; HSA-LABEL: @kern_f32(
; HSA-NEXT: [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to float addrspace(4)*
; HSA-NEXT: [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@ -293,7 +293,7 @@ define amdgpu_kernel void @kern_f32(float %arg0) {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_f32(
; MESA-NEXT: [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to float addrspace(4)*
; MESA-NEXT: [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@ -306,7 +306,7 @@ define amdgpu_kernel void @kern_f32(float %arg0) {
define amdgpu_kernel void @kern_v3i32(<3 x i32> %arg0) {
; HSA-LABEL: @kern_v3i32(
; HSA-NEXT: [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@ -315,7 +315,7 @@ define amdgpu_kernel void @kern_v3i32(<3 x i32> %arg0) {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_v3i32(
; MESA-NEXT: [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@ -329,7 +329,7 @@ define amdgpu_kernel void @kern_v3i32(<3 x i32> %arg0) {
define amdgpu_kernel void @kern_v8i32(<8 x i32> %arg) #0 {
; HSA-LABEL: @kern_v8i32(
; HSA-NEXT: [[KERN_V8I32_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(88) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_V8I32_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(288) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I32_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i32> addrspace(4)*
; HSA-NEXT: [[ARG_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@ -337,7 +337,7 @@ define amdgpu_kernel void @kern_v8i32(<8 x i32> %arg) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_v8i32(
; MESA-NEXT: [[KERN_V8I32_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(88) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_V8I32_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(288) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I32_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i32> addrspace(4)*
; MESA-NEXT: [[ARG_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@ -350,7 +350,7 @@ define amdgpu_kernel void @kern_v8i32(<8 x i32> %arg) #0 {
define amdgpu_kernel void @kern_v8i64(<8 x i64> %arg) #0 {
; HSA-LABEL: @kern_v8i64(
; HSA-NEXT: [[KERN_V8I64_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(120) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_V8I64_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(320) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I64_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i64> addrspace(4)*
; HSA-NEXT: [[ARG_LOAD:%.*]] = load <8 x i64>, <8 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@ -358,7 +358,7 @@ define amdgpu_kernel void @kern_v8i64(<8 x i64> %arg) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_v8i64(
; MESA-NEXT: [[KERN_V8I64_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(120) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_V8I64_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(320) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I64_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i64> addrspace(4)*
; MESA-NEXT: [[ARG_LOAD:%.*]] = load <8 x i64>, <8 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@ -371,7 +371,7 @@ define amdgpu_kernel void @kern_v8i64(<8 x i64> %arg) #0 {
define amdgpu_kernel void @kern_v16i64(<16 x i64> %arg) #0 {
; HSA-LABEL: @kern_v16i64(
; HSA-NEXT: [[KERN_V16I64_KERNARG_SEGMENT:%.*]] = call nonnull align 128 dereferenceable(184) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_V16I64_KERNARG_SEGMENT:%.*]] = call nonnull align 128 dereferenceable(384) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V16I64_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <16 x i64> addrspace(4)*
; HSA-NEXT: [[ARG_LOAD:%.*]] = load <16 x i64>, <16 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@ -379,7 +379,7 @@ define amdgpu_kernel void @kern_v16i64(<16 x i64> %arg) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_v16i64(
; MESA-NEXT: [[KERN_V16I64_KERNARG_SEGMENT:%.*]] = call nonnull align 128 dereferenceable(184) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_V16I64_KERNARG_SEGMENT:%.*]] = call nonnull align 128 dereferenceable(384) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V16I64_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <16 x i64> addrspace(4)*
; MESA-NEXT: [[ARG_LOAD:%.*]] = load <16 x i64>, <16 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@ -392,7 +392,7 @@ define amdgpu_kernel void @kern_v16i64(<16 x i64> %arg) #0 {
define amdgpu_kernel void @kern_i32_v3i32(i32 %arg0, <3 x i32> %arg1) {
; HSA-LABEL: @kern_i32_v3i32(
; HSA-NEXT: [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(88) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@ -405,7 +405,7 @@ define amdgpu_kernel void @kern_i32_v3i32(i32 %arg0, <3 x i32> %arg1) {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_i32_v3i32(
; MESA-NEXT: [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(88) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@ -427,7 +427,7 @@ define amdgpu_kernel void @kern_i32_v3i32(i32 %arg0, <3 x i32> %arg1) {
define amdgpu_kernel void @kern_struct_a(%struct.a %arg0) {
; HSA-LABEL: @kern_struct_a(
; HSA-NEXT: [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_A:%.*]] addrspace(4)*
; HSA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_A]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@ -435,7 +435,7 @@ define amdgpu_kernel void @kern_struct_a(%struct.a %arg0) {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_struct_a(
; MESA-NEXT: [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(268) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_A:%.*]] addrspace(4)*
; MESA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_A]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@ -448,7 +448,7 @@ define amdgpu_kernel void @kern_struct_a(%struct.a %arg0) {
define amdgpu_kernel void @kern_struct_b_packed(%struct.b.packed %arg0) #0 {
; HSA-LABEL: @kern_struct_b_packed(
; HSA-NEXT: [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(88) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_B_PACKED:%.*]] addrspace(4)*
; HSA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@ -456,7 +456,7 @@ define amdgpu_kernel void @kern_struct_b_packed(%struct.b.packed %arg0) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_struct_b_packed(
; MESA-NEXT: [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(88) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_B_PACKED:%.*]] addrspace(4)*
; MESA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@ -511,7 +511,7 @@ define amdgpu_kernel void @kernel_implicitarg_no_struct_align(<16 x i32>, i32 %a
define amdgpu_kernel void @kern_lds_ptr(i32 addrspace(3)* %lds) #0 {
; HSA-LABEL: @kern_lds_ptr(
; HSA-NEXT: [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[LDS_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[LDS_KERNARG_OFFSET]] to i32 addrspace(3)* addrspace(4)*
; HSA-NEXT: [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@ -519,7 +519,7 @@ define amdgpu_kernel void @kern_lds_ptr(i32 addrspace(3)* %lds) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_lds_ptr(
; MESA-NEXT: [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[LDS_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[LDS_KERNARG_OFFSET]] to i32 addrspace(3)* addrspace(4)*
; MESA-NEXT: [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@ -532,12 +532,12 @@ define amdgpu_kernel void @kern_lds_ptr(i32 addrspace(3)* %lds) #0 {
define amdgpu_kernel void @kern_lds_ptr_si(i32 addrspace(3)* %lds) #2 {
; HSA-LABEL: @kern_lds_ptr_si(
; HSA-NEXT: [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_lds_ptr_si(
; MESA-NEXT: [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4
; MESA-NEXT: ret void
;
@@ -547,7 +547,7 @@ define amdgpu_kernel void @kern_lds_ptr_si(i32 addrspace(3)* %lds) #2 {
define amdgpu_kernel void @kern_realign_i8_i8(i8 %arg0, i8 %arg1) #0 {
; HSA-LABEL: @kern_realign_i8_i8(
; HSA-NEXT: [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@@ -562,7 +562,7 @@ define amdgpu_kernel void @kern_realign_i8_i8(i8 %arg0, i8 %arg1) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_realign_i8_i8(
; MESA-NEXT: [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@@ -583,7 +583,7 @@ define amdgpu_kernel void @kern_realign_i8_i8(i8 %arg0, i8 %arg1) #0 {
define amdgpu_kernel void @kern_realign_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2) #0 {
; HSA-LABEL: @kern_realign_i8_i8_i8(
; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@@ -604,7 +604,7 @@ define amdgpu_kernel void @kern_realign_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2) #
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_realign_i8_i8_i8(
; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@@ -632,7 +632,7 @@ define amdgpu_kernel void @kern_realign_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2) #
define amdgpu_kernel void @kern_realign_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) #0 {
; HSA-LABEL: @kern_realign_i8_i8_i8_i8(
; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@@ -659,7 +659,7 @@ define amdgpu_kernel void @kern_realign_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_realign_i8_i8_i8_i8(
; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@@ -694,7 +694,7 @@ define amdgpu_kernel void @kern_realign_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2
define amdgpu_kernel void @kern_realign_i8_v3i8(i8 %arg0, <3 x i8> %arg1) #0 {
; HSA-LABEL: @kern_realign_i8_v3i8(
; HSA-NEXT: [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@@ -709,7 +709,7 @@ define amdgpu_kernel void @kern_realign_i8_v3i8(i8 %arg0, <3 x i8> %arg1) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_realign_i8_v3i8(
; MESA-NEXT: [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@@ -730,7 +730,7 @@ define amdgpu_kernel void @kern_realign_i8_v3i8(i8 %arg0, <3 x i8> %arg1) #0 {
define amdgpu_kernel void @kern_realign_i8_i16(i8 %arg0, i16 %arg1) #0 {
; HSA-LABEL: @kern_realign_i8_i16(
; HSA-NEXT: [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@@ -745,7 +745,7 @@ define amdgpu_kernel void @kern_realign_i8_i16(i8 %arg0, i16 %arg1) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_realign_i8_i16(
; MESA-NEXT: [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@@ -766,7 +766,7 @@ define amdgpu_kernel void @kern_realign_i8_i16(i8 %arg0, i16 %arg1) #0 {
define amdgpu_kernel void @kern_realign_i1_i1(i1 %arg0, i1 %arg1) #0 {
; HSA-LABEL: @kern_realign_i1_i1(
; HSA-NEXT: [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@@ -781,7 +781,7 @@ define amdgpu_kernel void @kern_realign_i1_i1(i1 %arg0, i1 %arg1) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_realign_i1_i1(
; MESA-NEXT: [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@@ -802,7 +802,7 @@ define amdgpu_kernel void @kern_realign_i1_i1(i1 %arg0, i1 %arg1) #0 {
define amdgpu_kernel void @kern_realign_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2) #0 {
; HSA-LABEL: @kern_realign_i1_i1_i1(
; HSA-NEXT: [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@@ -823,7 +823,7 @@ define amdgpu_kernel void @kern_realign_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2) #
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_realign_i1_i1_i1(
; MESA-NEXT: [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@@ -851,7 +851,7 @@ define amdgpu_kernel void @kern_realign_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2) #
define amdgpu_kernel void @kern_realign_i1_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3) #0 {
; HSA-LABEL: @kern_realign_i1_i1_i1_i1(
; HSA-NEXT: [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@@ -878,7 +878,7 @@ define amdgpu_kernel void @kern_realign_i1_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_realign_i1_i1_i1_i1(
; MESA-NEXT: [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@@ -913,7 +913,7 @@ define amdgpu_kernel void @kern_realign_i1_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2
define amdgpu_kernel void @kern_realign_i1_v3i1(i1 %arg0, <3 x i1> %arg1) #0 {
; HSA-LABEL: @kern_realign_i1_v3i1(
; HSA-NEXT: [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@@ -929,7 +929,7 @@ define amdgpu_kernel void @kern_realign_i1_v3i1(i1 %arg0, <3 x i1> %arg1) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_realign_i1_v3i1(
; MESA-NEXT: [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@@ -951,7 +951,7 @@ define amdgpu_kernel void @kern_realign_i1_v3i1(i1 %arg0, <3 x i1> %arg1) #0 {
define amdgpu_kernel void @kern_realign_i1_i16(i1 %arg0, i16 %arg1) #0 {
; HSA-LABEL: @kern_realign_i1_i16(
; HSA-NEXT: [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@@ -966,7 +966,7 @@ define amdgpu_kernel void @kern_realign_i1_i16(i1 %arg0, i16 %arg1) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_realign_i1_i16(
; MESA-NEXT: [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@@ -987,7 +987,7 @@ define amdgpu_kernel void @kern_realign_i1_i16(i1 %arg0, i16 %arg1) #0 {
define amdgpu_kernel void @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3, i8 %arg4, i8 %arg5, i8 %arg6, i8 %arg7) #0 {
; HSA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@@ -1032,7 +1032,7 @@ define amdgpu_kernel void @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(i8 %arg0, i8 %ar
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@@ -1088,7 +1088,7 @@ define amdgpu_kernel void @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(i8 %arg0, i8 %ar
define amdgpu_kernel void @kern_realign_f16_f16(half %arg0, half %arg1) #0 {
; HSA-LABEL: @kern_realign_f16_f16(
; HSA-NEXT: [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
@@ -1105,7 +1105,7 @@ define amdgpu_kernel void @kern_realign_f16_f16(half %arg0, half %arg1) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_realign_f16_f16(
; MESA-NEXT: [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
@@ -1128,7 +1128,7 @@ define amdgpu_kernel void @kern_realign_f16_f16(half %arg0, half %arg1) #0 {
define amdgpu_kernel void @kern_global_ptr(i8 addrspace(1)* %ptr) #0 {
; HSA-LABEL: @kern_global_ptr(
; HSA-NEXT: [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@@ -1136,7 +1136,7 @@ define amdgpu_kernel void @kern_global_ptr(i8 addrspace(1)* %ptr) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_global_ptr(
; MESA-NEXT: [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@@ -1149,7 +1149,7 @@ define amdgpu_kernel void @kern_global_ptr(i8 addrspace(1)* %ptr) #0 {
define amdgpu_kernel void @kern_global_ptr_dereferencable(i8 addrspace(1)* dereferenceable(42) %ptr) #0 {
; HSA-LABEL: @kern_global_ptr_dereferencable(
; HSA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !dereferenceable !1
@@ -1157,7 +1157,7 @@ define amdgpu_kernel void @kern_global_ptr_dereferencable(i8 addrspace(1)* deref
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_global_ptr_dereferencable(
; MESA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !dereferenceable !1
@@ -1170,7 +1170,7 @@ define amdgpu_kernel void @kern_global_ptr_dereferencable(i8 addrspace(1)* deref
define amdgpu_kernel void @kern_global_ptr_dereferencable_or_null(i8 addrspace(1)* dereferenceable_or_null(128) %ptr) #0 {
; HSA-LABEL: @kern_global_ptr_dereferencable_or_null(
; HSA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !dereferenceable_or_null !2
@@ -1178,7 +1178,7 @@ define amdgpu_kernel void @kern_global_ptr_dereferencable_or_null(i8 addrspace(1
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_global_ptr_dereferencable_or_null(
; MESA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !dereferenceable_or_null !2
@@ -1191,7 +1191,7 @@ define amdgpu_kernel void @kern_global_ptr_dereferencable_or_null(i8 addrspace(1
define amdgpu_kernel void @kern_nonnull_global_ptr(i8 addrspace(1)* nonnull %ptr) #0 {
; HSA-LABEL: @kern_nonnull_global_ptr(
; HSA-NEXT: [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !nonnull !0
@@ -1199,7 +1199,7 @@ define amdgpu_kernel void @kern_nonnull_global_ptr(i8 addrspace(1)* nonnull %ptr
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_nonnull_global_ptr(
; MESA-NEXT: [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !nonnull !0
@@ -1212,7 +1212,7 @@ define amdgpu_kernel void @kern_nonnull_global_ptr(i8 addrspace(1)* nonnull %ptr
define amdgpu_kernel void @kern_align32_global_ptr(i8 addrspace(1)* align 1024 %ptr) #0 {
; HSA-LABEL: @kern_align32_global_ptr(
; HSA-NEXT: [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !align !3
@@ -1220,7 +1220,7 @@ define amdgpu_kernel void @kern_align32_global_ptr(i8 addrspace(1)* align 1024 %
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_align32_global_ptr(
; MESA-NEXT: [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !align !3
@@ -1233,12 +1233,12 @@ define amdgpu_kernel void @kern_align32_global_ptr(i8 addrspace(1)* align 1024 %
define amdgpu_kernel void @kern_noalias_global_ptr(i8 addrspace(1)* noalias %ptr) #0 {
; HSA-LABEL: @kern_noalias_global_ptr(
; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_noalias_global_ptr(
; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
; MESA-NEXT: ret void
;
@@ -1248,13 +1248,13 @@ define amdgpu_kernel void @kern_noalias_global_ptr(i8 addrspace(1)* noalias %ptr
define amdgpu_kernel void @kern_noalias_global_ptr_x2(i8 addrspace(1)* noalias %ptr0, i8 addrspace(1)* noalias %ptr1) #0 {
; HSA-LABEL: @kern_noalias_global_ptr_x2(
; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
; HSA-NEXT: ret void
;
; MESA-LABEL: @kern_noalias_global_ptr_x2(
; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
; MESA-NEXT: ret void
@@ -1267,7 +1267,7 @@ define amdgpu_kernel void @kern_noalias_global_ptr_x2(i8 addrspace(1)* noalias %
define amdgpu_kernel void @struct_i8_i8_arg({i8, i8} %in) #0 {
; HSA-LABEL: @struct_i8_i8_arg(
; HSA-NEXT: entry:
; HSA-NEXT: [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i8 } addrspace(4)*
; HSA-NEXT: [[IN_LOAD:%.*]] = load { i8, i8 }, { i8, i8 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@@ -1279,7 +1279,7 @@ define amdgpu_kernel void @struct_i8_i8_arg({i8, i8} %in) #0 {
;
; MESA-LABEL: @struct_i8_i8_arg(
; MESA-NEXT: entry:
; MESA-NEXT: [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i8 } addrspace(4)*
; MESA-NEXT: [[IN_LOAD:%.*]] = load { i8, i8 }, { i8, i8 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@@ -1300,7 +1300,7 @@ entry:
define amdgpu_kernel void @struct_i8_i16_arg({i8, i16} %in) #0 {
; HSA-LABEL: @struct_i8_i16_arg(
; HSA-NEXT: entry:
; HSA-NEXT: [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i16 } addrspace(4)*
; HSA-NEXT: [[IN_LOAD:%.*]] = load { i8, i16 }, { i8, i16 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@@ -1312,7 +1312,7 @@ define amdgpu_kernel void @struct_i8_i16_arg({i8, i16} %in) #0 {
;
; MESA-LABEL: @struct_i8_i16_arg(
; MESA-NEXT: entry:
; MESA-NEXT: [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i16 } addrspace(4)*
; MESA-NEXT: [[IN_LOAD:%.*]] = load { i8, i16 }, { i8, i16 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@@ -1333,7 +1333,7 @@ entry:
define amdgpu_kernel void @array_2xi8_arg([2 x i8] %in) #0 {
; HSA-LABEL: @array_2xi8_arg(
; HSA-NEXT: entry:
; HSA-NEXT: [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i8] addrspace(4)*
; HSA-NEXT: [[IN_LOAD:%.*]] = load [2 x i8], [2 x i8] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@@ -1345,7 +1345,7 @@ define amdgpu_kernel void @array_2xi8_arg([2 x i8] %in) #0 {
;
; MESA-LABEL: @array_2xi8_arg(
; MESA-NEXT: entry:
; MESA-NEXT: [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i8] addrspace(4)*
; MESA-NEXT: [[IN_LOAD:%.*]] = load [2 x i8], [2 x i8] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@@ -1366,7 +1366,7 @@ entry:
define amdgpu_kernel void @array_2xi1_arg([2 x i1] %in) #0 {
; HSA-LABEL: @array_2xi1_arg(
; HSA-NEXT: entry:
; HSA-NEXT: [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i1] addrspace(4)*
; HSA-NEXT: [[IN_LOAD:%.*]] = load [2 x i1], [2 x i1] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@@ -1378,7 +1378,7 @@ define amdgpu_kernel void @array_2xi1_arg([2 x i1] %in) #0 {
;
; MESA-LABEL: @array_2xi1_arg(
; MESA-NEXT: entry:
; MESA-NEXT: [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i1] addrspace(4)*
; MESA-NEXT: [[IN_LOAD:%.*]] = load [2 x i1], [2 x i1] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@@ -1398,11 +1398,11 @@ entry:
define amdgpu_kernel void @only_empty_struct({} %empty) #0 {
; HSA-LABEL: @only_empty_struct(
; HSA-NEXT: [[ONLY_EMPTY_STRUCT_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(56) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ONLY_EMPTY_STRUCT_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(256) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: ret void
;
; MESA-LABEL: @only_empty_struct(
; MESA-NEXT: [[ONLY_EMPTY_STRUCT_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(56) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ONLY_EMPTY_STRUCT_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(256) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: ret void
;
ret void
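; @only_empty_struct is the degenerate case that isolates the implicit
; block: with zero explicit kernarg bytes, the segment is the implicit
; arguments alone, so dereferenceable(56) under v4 becomes
; dereferenceable(256) under v5, pinning the delta at exactly 200 bytes.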
@@ -1410,7 +1410,7 @@ define amdgpu_kernel void @only_empty_struct({} %empty) #0 {
define amdgpu_kernel void @empty_struct_with_other({} %empty, i32 %arg1) #0 {
; HSA-LABEL: @empty_struct_with_other(
; HSA-NEXT: [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
; HSA-NEXT: [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@@ -1418,7 +1418,7 @@ define amdgpu_kernel void @empty_struct_with_other({} %empty, i32 %arg1) #0 {
; HSA-NEXT: ret void
;
; MESA-LABEL: @empty_struct_with_other(
; MESA-NEXT: [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
; MESA-NEXT: [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@@ -1433,7 +1433,7 @@ define amdgpu_kernel void @empty_struct_with_other({} %empty, i32 %arg1) #0 {
define amdgpu_kernel void @static_alloca_kern_i32(i32 %arg0) {
; HSA-LABEL: @static_alloca_kern_i32(
; HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
; HSA-NEXT: [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@@ -1442,7 +1442,7 @@ define amdgpu_kernel void @static_alloca_kern_i32(i32 %arg0) {
;
; MESA-LABEL: @static_alloca_kern_i32(
; MESA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
; MESA-NEXT: [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@@ -1459,7 +1459,7 @@ define amdgpu_kernel void @static_alloca_kern_i32(i32 %arg0) {
define amdgpu_kernel void @dyn_alloca_kernarg_i32(i32 %n) {
; HSA-LABEL: @dyn_alloca_kernarg_i32(
; HSA-NEXT: [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5)
; HSA-NEXT: [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[N_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[N_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[N_KERNARG_OFFSET]] to i32 addrspace(4)*
; HSA-NEXT: [[N_LOAD:%.*]] = load i32, i32 addrspace(4)* [[N_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@@ -1470,7 +1470,7 @@ define amdgpu_kernel void @dyn_alloca_kernarg_i32(i32 %n) {
;
; MESA-LABEL: @dyn_alloca_kernarg_i32(
; MESA-NEXT: [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5)
; MESA-NEXT: [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[N_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[N_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[N_KERNARG_OFFSET]] to i32 addrspace(4)*
; MESA-NEXT: [[N_LOAD:%.*]] = load i32, i32 addrspace(4)* [[N_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@@ -1489,7 +1489,7 @@ define amdgpu_kernel void @dyn_alloca_kernarg_i32(i32 %n) {
; Byref pointers should only be treated as offsets from kernarg
define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %out, i8 addrspace(4)* byref(i8) %in.byref) {
; HSA-LABEL: @byref_constant_i8_arg(
; HSA-NEXT: [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@@ -1500,7 +1500,7 @@ define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %ou
; HSA-NEXT: ret void
;
; MESA-LABEL: @byref_constant_i8_arg(
; MESA-NEXT: [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(268) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
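; As the comment above @byref_constant_i8_arg says, a byref pointer is
; never materialized by a kernarg load; it is rewritten as a plain offset
; into the same segment. A rough sketch of the i8 case (hypothetical value
; names; offset 8 because %out occupies bytes 0-7 on HSA):
;
;   %in.byref.gep = getelementptr inbounds i8, i8 addrspace(4)* %segment, i64 8
;   %v = load i8, i8 addrspace(4)* %in.byref.gep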
@@ -1518,7 +1518,7 @@ define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %ou
define amdgpu_kernel void @byref_constant_i16_arg(i32 addrspace(1)* nocapture %out, i16 addrspace(4)* byref(i16) %in.byref) {
; HSA-LABEL: @byref_constant_i16_arg(
; HSA-NEXT: [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@@ -1530,7 +1530,7 @@ define amdgpu_kernel void @byref_constant_i16_arg(i32 addrspace(1)* nocapture %o
; HSA-NEXT: ret void
;
; MESA-LABEL: @byref_constant_i16_arg(
; MESA-NEXT: [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(268) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@@ -1549,7 +1549,7 @@ define amdgpu_kernel void @byref_constant_i16_arg(i32 addrspace(1)* nocapture %o
define amdgpu_kernel void @byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in.byref, i32 %after.offset) {
; HSA-LABEL: @byref_constant_i32_arg(
; HSA-NEXT: [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@@ -1564,7 +1564,7 @@ define amdgpu_kernel void @byref_constant_i32_arg(i32 addrspace(1)* nocapture %o
; HSA-NEXT: ret void
;
; MESA-LABEL: @byref_constant_i32_arg(
; MESA-NEXT: [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@@ -1586,7 +1586,7 @@ define amdgpu_kernel void @byref_constant_i32_arg(i32 addrspace(1)* nocapture %o
define amdgpu_kernel void @byref_constant_v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> addrspace(4)* byref(<4 x i32>) %in.byref, i32 %after.offset) {
; HSA-LABEL: @byref_constant_v4i32_arg(
; HSA-NEXT: [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(96) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(296) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to <4 x i32> addrspace(1)* addrspace(4)*
; HSA-NEXT: [[OUT_LOAD:%.*]] = load <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@@ -1602,7 +1602,7 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(<4 x i32> addrspace(1)* noca
; HSA-NEXT: ret void
;
; MESA-LABEL: @byref_constant_v4i32_arg(
; MESA-NEXT: [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(92) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(292) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to <4 x i32> addrspace(1)* addrspace(4)*
; MESA-NEXT: [[OUT_LOAD:%.*]] = load <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@@ -1626,7 +1626,7 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(<4 x i32> addrspace(1)* noca
define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) {
; HSA-LABEL: @byref_align_constant_i32_arg(
; HSA-NEXT: [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(320) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(520) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@@ -1641,7 +1641,7 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapt
; HSA-NEXT: ret void
;
; MESA-LABEL: @byref_align_constant_i32_arg(
; MESA-NEXT: [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(320) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(520) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
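; Worked arithmetic for the align(256) case above, read off the checks: the
; byref(i32) align(256) argument is placed at offset 256, so the explicit
; bytes are 256 + 4 + 4 (%after.offset) = 264, giving 264 + 56 = 320 under
; v4 and 264 + 256 = 520 under v5.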
@@ -1663,7 +1663,7 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapt
define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) %in.byref, i32 %after.offset) {
; HSA-LABEL: @byref_natural_align_constant_v16i32_arg(
; HSA-NEXT: [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(192) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(392) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@ -1679,7 +1679,7 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace
; HSA-NEXT: ret void
;
; MESA-LABEL: @byref_natural_align_constant_v16i32_arg(
; MESA-NEXT: [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(188) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(388) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@ -1704,7 +1704,7 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace
; Also accept byref kernel arguments with other global address spaces.
define amdgpu_kernel void @byref_global_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* byref(i32) %in.byref) {
; HSA-LABEL: @byref_global_i32_arg(
; HSA-NEXT: [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@ -1715,7 +1715,7 @@ define amdgpu_kernel void @byref_global_i32_arg(i32 addrspace(1)* nocapture %out
; HSA-NEXT: ret void
;
; MESA-LABEL: @byref_global_i32_arg(
; MESA-NEXT: [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(268) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@ -1732,7 +1732,7 @@ define amdgpu_kernel void @byref_global_i32_arg(i32 addrspace(1)* nocapture %out
define amdgpu_kernel void @byref_flat_i32_arg(i32 addrspace(1)* nocapture %out, i32* byref(i32) %in.byref) {
; HSA-LABEL: @byref_flat_i32_arg(
; HSA-NEXT: [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@ -1743,7 +1743,7 @@ define amdgpu_kernel void @byref_flat_i32_arg(i32 addrspace(1)* nocapture %out,
; HSA-NEXT: ret void
;
; MESA-LABEL: @byref_flat_i32_arg(
; MESA-NEXT: [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(268) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@ -1760,7 +1760,7 @@ define amdgpu_kernel void @byref_flat_i32_arg(i32 addrspace(1)* nocapture %out,
define amdgpu_kernel void @byref_constant_32bit_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(6)* byref(i32) %in.byref) {
; HSA-LABEL: @byref_constant_32bit_i32_arg(
; HSA-NEXT: [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@ -1771,7 +1771,7 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(i32 addrspace(1)* nocapt
; HSA-NEXT: ret void
;
; MESA-LABEL: @byref_constant_32bit_i32_arg(
; MESA-NEXT: [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(268) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@ -1788,7 +1788,7 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(i32 addrspace(1)* nocapt
define amdgpu_kernel void @byref_unknown_as_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(999)* byref(i32) %in.byref) {
; HSA-LABEL: @byref_unknown_as_i32_arg(
; HSA-NEXT: [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@ -1799,7 +1799,7 @@ define amdgpu_kernel void @byref_unknown_as_i32_arg(i32 addrspace(1)* nocapture
; HSA-NEXT: ret void
;
; MESA-LABEL: @byref_unknown_as_i32_arg(
; MESA-NEXT: [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(268) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@ -1817,7 +1817,7 @@ define amdgpu_kernel void @byref_unknown_as_i32_arg(i32 addrspace(1)* nocapture
; Invalid, but should not crash.
define amdgpu_kernel void @byref_local_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(3)* byref(i32) %in.byref) {
; HSA-LABEL: @byref_local_i32_arg(
; HSA-NEXT: [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@ -1828,7 +1828,7 @@ define amdgpu_kernel void @byref_local_i32_arg(i32 addrspace(1)* nocapture %out,
; HSA-NEXT: ret void
;
; MESA-LABEL: @byref_local_i32_arg(
; MESA-NEXT: [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(268) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@ -1845,7 +1845,7 @@ define amdgpu_kernel void @byref_local_i32_arg(i32 addrspace(1)* nocapture %out,
define amdgpu_kernel void @multi_byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in0.byref, i32 addrspace(4)* byref(i32) %in1.byref, i32 %after.offset) {
; HSA-LABEL: @multi_byref_constant_i32_arg(
; HSA-NEXT: [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(80) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(280) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
@ -1864,7 +1864,7 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(i32 addrspace(1)* nocapt
; HSA-NEXT: ret void
;
; MESA-LABEL: @multi_byref_constant_i32_arg(
; MESA-NEXT: [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(76) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(276) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
@ -1892,7 +1892,7 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(i32 addrspace(1)* nocapt
define amdgpu_kernel void @byref_constant_i32_arg_offset0(i32 addrspace(4)* byref(i32) %in.byref) {
; HSA-LABEL: @byref_constant_i32_arg_offset0(
; HSA-NEXT: [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT]], i64 0
; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
@ -1900,7 +1900,7 @@ define amdgpu_kernel void @byref_constant_i32_arg_offset0(i32 addrspace(4)* byre
; HSA-NEXT: ret void
;
; MESA-LABEL: @byref_constant_i32_arg_offset0(
; MESA-NEXT: [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT]], i64 36
; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
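Every dereferenceable size in this file grows by exactly 200 bytes because code object v5 reserves 256 bytes of implicit kernel arguments where v4 reserved 56. A minimal sketch of that arithmetic, assuming the implicit block is appended to the explicit arguments and the HSA total is padded to 8 bytes (the helper names are illustrative, not the in-tree API):

#include <cstdint>

// Assumed implicit-argument block: 56 bytes under code object v4,
// 256 bytes under v5 -- hence the uniform +200 in the checks above.
constexpr uint64_t implicitArgBytes(unsigned CodeObjectVersion) {
  return CodeObjectVersion >= 5 ? 256 : 56;
}

constexpr uint64_t alignTo(uint64_t Bytes, uint64_t Align) {
  return (Bytes + Align - 1) / Align * Align;
}

// HSA total = explicit arguments + implicit block, padded to 8 bytes.
constexpr uint64_t kernargSegmentSize(uint64_t ExplicitBytes,
                                      unsigned CodeObjectVersion) {
  return alignTo(ExplicitBytes + implicitArgBytes(CodeObjectVersion), 8);
}

// byref_global_i32_arg: 12 explicit bytes (8-byte out pointer + byref i32).
static_assert(kernargSegmentSize(12, 4) == 72, "old HSA check line");
static_assert(kernargSegmentSize(12, 5) == 272, "new HSA check line");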

View File

@ -28,20 +28,21 @@ store i32 0, i32 addrspace(3)* @used_by_kernel
define amdgpu_kernel void @withcall() {
; GFX9-LABEL: withcall:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_mov_b32 s11, 0xe00000
; GFX9-NEXT: s_add_u32 s8, s8, s3
; GFX9-NEXT: s_addc_u32 s9, s9, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[8:9]
; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s14, -1
; GFX9-NEXT: s_mov_b32 s15, 0xe00000
; GFX9-NEXT: s_add_u32 s12, s12, s3
; GFX9-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NEXT: s_add_u32 s8, s0, 36
; GFX9-NEXT: s_addc_u32 s9, s1, 0
; GFX9-NEXT: s_getpc_b64 s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT: s_mov_b64 s[0:1], s[12:13]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b64 s[2:3], s[10:11]
; GFX9-NEXT: s_mov_b64 s[2:3], s[14:15]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: ds_write_b32 v0, v0 offset:8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -50,20 +51,21 @@ define amdgpu_kernel void @withcall() {
;
; GFX10-LABEL: withcall:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s10, -1
; GFX10-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-NEXT: s_add_u32 s8, s8, s3
; GFX10-NEXT: s_addc_u32 s9, s9, 0
; GFX10-NEXT: s_getpc_b64 s[2:3]
; GFX10-NEXT: s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12
; GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s14, -1
; GFX10-NEXT: s_mov_b32 s15, 0x31c16000
; GFX10-NEXT: s_add_u32 s12, s12, s3
; GFX10-NEXT: s_addc_u32 s13, s13, 0
; GFX10-NEXT: s_add_u32 s8, s0, 36
; GFX10-NEXT: s_addc_u32 s9, s1, 0
; GFX10-NEXT: s_getpc_b64 s[0:1]
; GFX10-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX10-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], s[8:9]
; GFX10-NEXT: s_mov_b64 s[2:3], s[10:11]
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT: s_mov_b64 s[0:1], s[12:13]
; GFX10-NEXT: s_mov_b64 s[2:3], s[14:15]
; GFX10-NEXT: s_mov_b32 s32, 0
; GFX10-NEXT: ds_write_b32 v0, v0 offset:8
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@ -72,21 +74,22 @@ define amdgpu_kernel void @withcall() {
;
; G_GFX9-LABEL: withcall:
; G_GFX9: ; %bb.0:
; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; G_GFX9-NEXT: s_mov_b32 s10, -1
; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000
; G_GFX9-NEXT: s_add_u32 s8, s8, s3
; G_GFX9-NEXT: s_addc_u32 s9, s9, 0
; G_GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; G_GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; G_GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; G_GFX9-NEXT: s_mov_b32 s14, -1
; G_GFX9-NEXT: s_mov_b32 s15, 0xe00000
; G_GFX9-NEXT: s_add_u32 s12, s12, s3
; G_GFX9-NEXT: s_addc_u32 s13, s13, 0
; G_GFX9-NEXT: s_add_u32 s8, s0, 36
; G_GFX9-NEXT: s_addc_u32 s9, s1, 0
; G_GFX9-NEXT: s_getpc_b64 s[0:1]
; G_GFX9-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
; G_GFX9-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
; G_GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; G_GFX9-NEXT: s_mov_b64 s[0:1], s[8:9]
; G_GFX9-NEXT: s_mov_b64 s[0:1], s[12:13]
; G_GFX9-NEXT: v_mov_b32_e32 v0, 0
; G_GFX9-NEXT: v_mov_b32_e32 v1, 8
; G_GFX9-NEXT: s_mov_b64 s[2:3], s[10:11]
; G_GFX9-NEXT: s_mov_b64 s[2:3], s[14:15]
; G_GFX9-NEXT: s_mov_b32 s32, 0
; G_GFX9-NEXT: ds_write_b32 v1, v0
; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -95,21 +98,22 @@ define amdgpu_kernel void @withcall() {
;
; G_GFX10-LABEL: withcall:
; G_GFX10: ; %bb.0:
; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; G_GFX10-NEXT: s_mov_b32 s10, -1
; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000
; G_GFX10-NEXT: s_add_u32 s8, s8, s3
; G_GFX10-NEXT: s_addc_u32 s9, s9, 0
; G_GFX10-NEXT: s_mov_b64 s[6:7], s[0:1]
; G_GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; G_GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; G_GFX10-NEXT: s_mov_b32 s14, -1
; G_GFX10-NEXT: s_mov_b32 s15, 0x31c16000
; G_GFX10-NEXT: s_add_u32 s12, s12, s3
; G_GFX10-NEXT: s_addc_u32 s13, s13, 0
; G_GFX10-NEXT: s_add_u32 s8, s0, 36
; G_GFX10-NEXT: s_addc_u32 s9, s1, 0
; G_GFX10-NEXT: s_getpc_b64 s[0:1]
; G_GFX10-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
; G_GFX10-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
; G_GFX10-NEXT: v_mov_b32_e32 v0, 0
; G_GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; G_GFX10-NEXT: v_mov_b32_e32 v1, 8
; G_GFX10-NEXT: s_mov_b64 s[0:1], s[8:9]
; G_GFX10-NEXT: s_mov_b64 s[2:3], s[10:11]
; G_GFX10-NEXT: s_mov_b64 s[0:1], s[12:13]
; G_GFX10-NEXT: s_mov_b64 s[2:3], s[14:15]
; G_GFX10-NEXT: s_mov_b32 s32, 0
; G_GFX10-NEXT: ds_write_b32 v1, v0
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)

View File

@ -1,4 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; XFAIL: *
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=DEFAULTSIZE,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdhsa-code-object-version=5 < %s | FileCheck -check-prefixes=DEFAULTSIZE-V5,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=ASSUME1024,MUBUF %s
@ -51,6 +54,42 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_endpgm
;
; DEFAULTSIZE-V5-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; DEFAULTSIZE-V5: ; %bb.0: ; %entry
; DEFAULTSIZE-V5-NEXT: s_add_u32 s0, s0, s9
; DEFAULTSIZE-V5-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; DEFAULTSIZE-V5-NEXT: s_addc_u32 s1, s1, 0
; DEFAULTSIZE-V5-NEXT: s_movk_i32 s32, 0x400
; DEFAULTSIZE-V5-NEXT: s_mov_b32 s33, 0
; DEFAULTSIZE-V5-NEXT: s_waitcnt lgkmcnt(0)
; DEFAULTSIZE-V5-NEXT: s_cmp_lg_u32 s8, 0
; DEFAULTSIZE-V5-NEXT: s_cbranch_scc1 .LBB0_3
; DEFAULTSIZE-V5-NEXT: ; %bb.1: ; %bb.0
; DEFAULTSIZE-V5-NEXT: s_cmp_lg_u32 s9, 0
; DEFAULTSIZE-V5-NEXT: s_cbranch_scc1 .LBB0_3
; DEFAULTSIZE-V5-NEXT: ; %bb.2: ; %bb.1
; DEFAULTSIZE-V5-NEXT: s_add_i32 s6, s32, 0x1000
; DEFAULTSIZE-V5-NEXT: s_lshl_b32 s7, s10, 2
; DEFAULTSIZE-V5-NEXT: s_mov_b32 s32, s6
; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v1, 0
; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v2, s6
; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v3, 1
; DEFAULTSIZE-V5-NEXT: s_add_i32 s6, s6, s7
; DEFAULTSIZE-V5-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; DEFAULTSIZE-V5-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v2, s6
; DEFAULTSIZE-V5-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; DEFAULTSIZE-V5-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; DEFAULTSIZE-V5-NEXT: s_waitcnt vmcnt(0)
; DEFAULTSIZE-V5-NEXT: v_add_u32_e32 v0, v2, v0
; DEFAULTSIZE-V5-NEXT: s_waitcnt lgkmcnt(0)
; DEFAULTSIZE-V5-NEXT: global_store_dword v1, v0, s[4:5]
; DEFAULTSIZE-V5-NEXT: .LBB0_3: ; %bb.2
; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v0, 0
; DEFAULTSIZE-V5-NEXT: global_store_dword v[0:1], v0, off
; DEFAULTSIZE-V5-NEXT: s_waitcnt vmcnt(0)
; DEFAULTSIZE-V5-NEXT: s_endpgm
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
@ -110,8 +149,8 @@ bb.2:
store volatile i32 0, i32 addrspace(1)* undef
ret void
}
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
; DEFAULTSIZE: ; ScratchSize: 4112
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 16
; DEFAULTSIZE: ; ScratchSize: 16
; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 16
; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
; DEFAULTSIZE-V5: ; ScratchSize: 16
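The old 4112 was 16 statically known bytes plus a 4096-byte allowance that v4 folds into the fixed size for stack objects it must treat as dynamic (the alloca sits in a non-entry block); 4096 is the default of -amdgpu-assume-dynamic-stack-object-size, which the ASSUME1024 run overrides. Under v5, now the default, only the static bytes are reported and the dynamic part is flagged through .amdhsa_uses_dynamic_stack. A rough sketch under those assumptions:

#include <cstdint>

struct KernelStackInfo {
  uint64_t StaticBytes; // statically known stack objects
  bool HasDynamicStack; // e.g. an alloca outside the entry block
};

// Illustrative only: how the reported fixed private-segment size differs
// between code object v4 and v5.
uint64_t fixedPrivateSegmentSize(const KernelStackInfo &SI, unsigned COV,
                                 uint64_t AssumedDynamicBytes = 4096) {
  if (SI.HasDynamicStack && COV < 5)
    return SI.StaticBytes + AssumedDynamicBytes; // 16 -> 4112, 64 -> 4160
  // v5 reports the static size and sets .amdhsa_uses_dynamic_stack 1.
  return SI.StaticBytes;
}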
@ -154,6 +193,40 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_endpgm
;
; DEFAULTSIZE-V5-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; DEFAULTSIZE-V5: ; %bb.0: ; %entry
; DEFAULTSIZE-V5-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
; DEFAULTSIZE-V5-NEXT: s_add_u32 s0, s0, s9
; DEFAULTSIZE-V5-NEXT: s_addc_u32 s1, s1, 0
; DEFAULTSIZE-V5-NEXT: s_movk_i32 s32, 0x1000
; DEFAULTSIZE-V5-NEXT: s_mov_b32 s33, 0
; DEFAULTSIZE-V5-NEXT: s_waitcnt lgkmcnt(0)
; DEFAULTSIZE-V5-NEXT: s_cmp_lg_u32 s6, 0
; DEFAULTSIZE-V5-NEXT: s_cbranch_scc1 .LBB1_2
; DEFAULTSIZE-V5-NEXT: ; %bb.1: ; %bb.0
; DEFAULTSIZE-V5-NEXT: s_add_i32 s6, s32, 0x1000
; DEFAULTSIZE-V5-NEXT: s_and_b32 s6, s6, 0xfffff000
; DEFAULTSIZE-V5-NEXT: s_lshl_b32 s7, s7, 2
; DEFAULTSIZE-V5-NEXT: s_mov_b32 s32, s6
; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v1, 0
; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v2, s6
; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v3, 1
; DEFAULTSIZE-V5-NEXT: s_add_i32 s6, s6, s7
; DEFAULTSIZE-V5-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; DEFAULTSIZE-V5-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v2, s6
; DEFAULTSIZE-V5-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; DEFAULTSIZE-V5-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; DEFAULTSIZE-V5-NEXT: s_waitcnt vmcnt(0)
; DEFAULTSIZE-V5-NEXT: v_add_u32_e32 v0, v2, v0
; DEFAULTSIZE-V5-NEXT: s_waitcnt lgkmcnt(0)
; DEFAULTSIZE-V5-NEXT: global_store_dword v1, v0, s[4:5]
; DEFAULTSIZE-V5-NEXT: .LBB1_2: ; %bb.1
; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v0, 0
; DEFAULTSIZE-V5-NEXT: global_store_dword v[0:1], v0, off
; DEFAULTSIZE-V5-NEXT: s_waitcnt vmcnt(0)
; DEFAULTSIZE-V5-NEXT: s_endpgm
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
@ -206,8 +279,8 @@ bb.1:
ret void
}
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
; DEFAULTSIZE: ; ScratchSize: 4160
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 64
; DEFAULTSIZE: ; ScratchSize: 64
; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 64
; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
; DEFAULTSIZE-V5: ; ScratchSize: 64
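The align64 variant follows the same arithmetic as the sketch above: 4160 = 4096 assumed dynamic bytes + 64 static bytes under v4, while the v5 default reports only the 64 static bytes and sets .amdhsa_uses_dynamic_stack 1.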
@ -253,6 +326,42 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
; MUBUF-NEXT: s_mov_b32 s33, s7
; MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; DEFAULTSIZE-V5-LABEL: func_non_entry_block_static_alloca_align4:
; DEFAULTSIZE-V5: ; %bb.0: ; %entry
; DEFAULTSIZE-V5-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DEFAULTSIZE-V5-NEXT: s_mov_b32 s7, s33
; DEFAULTSIZE-V5-NEXT: s_mov_b32 s33, s32
; DEFAULTSIZE-V5-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; DEFAULTSIZE-V5-NEXT: s_addk_i32 s32, 0x400
; DEFAULTSIZE-V5-NEXT: s_and_saveexec_b64 s[4:5], vcc
; DEFAULTSIZE-V5-NEXT: s_cbranch_execz .LBB2_3
; DEFAULTSIZE-V5-NEXT: ; %bb.1: ; %bb.0
; DEFAULTSIZE-V5-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; DEFAULTSIZE-V5-NEXT: s_and_b64 exec, exec, vcc
; DEFAULTSIZE-V5-NEXT: s_cbranch_execz .LBB2_3
; DEFAULTSIZE-V5-NEXT: ; %bb.2: ; %bb.1
; DEFAULTSIZE-V5-NEXT: s_add_i32 s6, s32, 0x1000
; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v2, 0
; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v3, s6
; DEFAULTSIZE-V5-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v2, 1
; DEFAULTSIZE-V5-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
; DEFAULTSIZE-V5-NEXT: v_lshl_add_u32 v2, v4, 2, s6
; DEFAULTSIZE-V5-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; DEFAULTSIZE-V5-NEXT: v_and_b32_e32 v3, 0x3ff, v31
; DEFAULTSIZE-V5-NEXT: s_mov_b32 s32, s6
; DEFAULTSIZE-V5-NEXT: s_waitcnt vmcnt(0)
; DEFAULTSIZE-V5-NEXT: v_add_u32_e32 v2, v2, v3
; DEFAULTSIZE-V5-NEXT: global_store_dword v[0:1], v2, off
; DEFAULTSIZE-V5-NEXT: .LBB2_3: ; %bb.2
; DEFAULTSIZE-V5-NEXT: s_or_b64 exec, exec, s[4:5]
; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v0, 0
; DEFAULTSIZE-V5-NEXT: global_store_dword v[0:1], v0, off
; DEFAULTSIZE-V5-NEXT: s_waitcnt vmcnt(0)
; DEFAULTSIZE-V5-NEXT: s_addk_i32 s32, 0xfc00
; DEFAULTSIZE-V5-NEXT: s_mov_b32 s33, s7
; DEFAULTSIZE-V5-NEXT: s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@ -349,6 +458,40 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
; MUBUF-NEXT: s_mov_b32 s33, s7
; MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; DEFAULTSIZE-V5-LABEL: func_non_entry_block_static_alloca_align64:
; DEFAULTSIZE-V5: ; %bb.0: ; %entry
; DEFAULTSIZE-V5-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DEFAULTSIZE-V5-NEXT: s_mov_b32 s7, s33
; DEFAULTSIZE-V5-NEXT: s_add_i32 s33, s32, 0xfc0
; DEFAULTSIZE-V5-NEXT: s_and_b32 s33, s33, 0xfffff000
; DEFAULTSIZE-V5-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; DEFAULTSIZE-V5-NEXT: s_addk_i32 s32, 0x2000
; DEFAULTSIZE-V5-NEXT: s_and_saveexec_b64 s[4:5], vcc
; DEFAULTSIZE-V5-NEXT: s_cbranch_execz .LBB3_2
; DEFAULTSIZE-V5-NEXT: ; %bb.1: ; %bb.0
; DEFAULTSIZE-V5-NEXT: s_add_i32 s6, s32, 0x1000
; DEFAULTSIZE-V5-NEXT: s_and_b32 s6, s6, 0xfffff000
; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v2, 0
; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v4, s6
; DEFAULTSIZE-V5-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v2, 1
; DEFAULTSIZE-V5-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
; DEFAULTSIZE-V5-NEXT: v_lshl_add_u32 v2, v3, 2, s6
; DEFAULTSIZE-V5-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; DEFAULTSIZE-V5-NEXT: v_and_b32_e32 v3, 0x3ff, v31
; DEFAULTSIZE-V5-NEXT: s_mov_b32 s32, s6
; DEFAULTSIZE-V5-NEXT: s_waitcnt vmcnt(0)
; DEFAULTSIZE-V5-NEXT: v_add_u32_e32 v2, v2, v3
; DEFAULTSIZE-V5-NEXT: global_store_dword v[0:1], v2, off
; DEFAULTSIZE-V5-NEXT: .LBB3_2: ; %bb.1
; DEFAULTSIZE-V5-NEXT: s_or_b64 exec, exec, s[4:5]
; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v0, 0
; DEFAULTSIZE-V5-NEXT: global_store_dword v[0:1], v0, off
; DEFAULTSIZE-V5-NEXT: s_waitcnt vmcnt(0)
; DEFAULTSIZE-V5-NEXT: s_addk_i32 s32, 0xe000
; DEFAULTSIZE-V5-NEXT: s_mov_b32 s33, s7
; DEFAULTSIZE-V5-NEXT: s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@ -406,3 +549,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ASSUME1024: {{.*}}
; DEFAULTSIZE: {{.*}}

View File

@ -77,7 +77,7 @@ declare i32 @foo(ptr addrspace(5)) #0
; ASM: buffer_store_dword
; ASM: buffer_store_dword
; ASM: s_swappc_b64
; ASM: ScratchSize: 16400
; ASM: ScratchSize: 16
define amdgpu_kernel void @call_private(ptr addrspace(1) %out, i32 %in) #0 {
entry:
%tmp = alloca [2 x i32], addrspace(5)
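The same accounting applies here: the old 16400 combined the kernel's 16 static bytes with the 16384-byte allowance v4 assumes for a call whose stack usage is unknown (the external @foo); with v5 now the default, only the 16 static bytes are reported.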

View File

@ -32,7 +32,7 @@ define void @tail_recursive_with_stack() {
; For an arbitrary recursive call, report a large number for the unknown
; stack usage on code object v4 and older
; CHECK-LABEL: {{^}}calls_recursive:
; CHECK: .amdhsa_private_segment_fixed_size 16400{{$}}
; CHECK: .amdhsa_private_segment_fixed_size 16{{$}}
;
; V5-LABEL: {{^}}calls_recursive:
; V5: .amdhsa_private_segment_fixed_size 0{{$}}
@ -56,7 +56,7 @@ define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() {
; in the kernel.
; CHECK-LABEL: {{^}}kernel_calls_tail_recursive:
; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
; CHECK: .amdhsa_private_segment_fixed_size 0{{$}}
;
; V5-LABEL: {{^}}kernel_calls_tail_recursive:
; V5: .amdhsa_private_segment_fixed_size 0{{$}}
@ -67,7 +67,7 @@ define amdgpu_kernel void @kernel_calls_tail_recursive() {
}
; CHECK-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
; CHECK: .amdhsa_private_segment_fixed_size 8{{$}}
;
; V5-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
; V5: .amdhsa_private_segment_fixed_size 8{{$}}
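Same pattern for these recursive cases: v4 folded the 16384-byte unknown-stack allowance into the fixed size, while the un-suffixed CHECK run now sees the v5 default and reports only the statically known bytes (16, 0, and 8 for the three kernels above).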

View File

@ -1,5 +1,5 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -amdgpu-lower-kernel-attributes -instcombine %s | FileCheck -enable-var-scope %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck -enable-var-scope %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S --amdhsa-code-object-version=4 -amdgpu-lower-kernel-attributes -instcombine %s | FileCheck -enable-var-scope %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S --amdhsa-code-object-version=4 -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck -enable-var-scope %s
target datalayout = "n32"
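The only change in this test is pinning --amdhsa-code-object-version=4 explicitly, so the existing CHECK lines keep matching the v4 argument layout now that the default has moved to v5.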

View File

@ -21,7 +21,7 @@ define internal fastcc void @unreachable() {
; GCN: s_endpgm
; GCN: .amdhsa_private_segment_fixed_size 0
; GCN-NOT: .amdhsa_uses_dynamic_stack 0
; GCN: .amdhsa_uses_dynamic_stack 0
; GCN-V5: .amdhsa_uses_dynamic_stack 0
define amdgpu_kernel void @entry() {
bb0:
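With v5 now the default, .amdhsa_uses_dynamic_stack is emitted on every run, so the test checks for an explicit uses_dynamic_stack 0 rather than asserting the directive's absence.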

View File

@ -13,24 +13,19 @@ define amdgpu_kernel void @kernel() {
; GCN-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GCN-NEXT: s_mov_b32 s38, -1
; GCN-NEXT: s_mov_b32 s39, 0xe00000
; GCN-NEXT: v_writelane_b32 v40, s4, 0
; GCN-NEXT: s_add_u32 s36, s36, s11
; GCN-NEXT: v_writelane_b32 v40, s5, 1
; GCN-NEXT: s_add_u32 s36, s36, s9
; GCN-NEXT: s_addc_u32 s37, s37, 0
; GCN-NEXT: s_mov_b32 s14, s8
; GCN-NEXT: s_add_u32 s8, s2, 36
; GCN-NEXT: s_addc_u32 s9, s3, 0
; GCN-NEXT: s_mov_b64 s[10:11], s[4:5]
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: v_readlane_b32 s0, v40, 0
; GCN-NEXT: s_mov_b32 s13, s9
; GCN-NEXT: s_mov_b32 s12, s8
; GCN-NEXT: v_readlane_b32 s1, v40, 1
; GCN-NEXT: s_add_u32 s8, s0, 36
; GCN-NEXT: s_addc_u32 s9, s1, 0
; GCN-NEXT: s_getpc_b64 s[0:1]
; GCN-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GCN-NEXT: s_mov_b32 s14, s10
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT: s_mov_b32 s13, s7
; GCN-NEXT: s_mov_b32 s12, s6
; GCN-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: s_mov_b64 s[0:1], s[36:37]
@ -38,7 +33,7 @@ define amdgpu_kernel void @kernel() {
; GCN-NEXT: s_mov_b64 s[2:3], s[38:39]
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: s_endpgm
call void @foo()
ret void

View File

@ -31,11 +31,11 @@ declare void @llvm.debugtrap() #1
; MESA-TRAP: .section .AMDGPU.config
; MESA-TRAP: .long 47180
; MESA-TRAP-NEXT: .long 208
; MESA-TRAP-NEXT: .long 204
; NOMESA-TRAP: .section .AMDGPU.config
; NOMESA-TRAP: .long 47180
; NOMESA-TRAP-NEXT: .long 144
; NOMESA-TRAP-NEXT: .long 140
; GCN-LABEL: {{^}}hsa_trap:
; HSA-TRAP: enable_trap_handler = 0

File diff suppressed because it is too large

View File

@ -290,14 +290,14 @@ define hidden void @blam() {
; GCN-NEXT: v_writelane_b32 v40, s47, 15
; GCN-NEXT: v_writelane_b32 v40, s48, 16
; GCN-NEXT: v_writelane_b32 v40, s49, 17
; GCN-NEXT: s_mov_b64 s[34:35], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v41, v31
; GCN-NEXT: s_mov_b32 s44, s15
; GCN-NEXT: s_mov_b32 s45, s14
; GCN-NEXT: s_mov_b32 s46, s13
; GCN-NEXT: s_mov_b32 s47, s12
; GCN-NEXT: s_mov_b64 s[34:35], s[10:11]
; GCN-NEXT: s_mov_b64 s[36:37], s[8:9]
; GCN-NEXT: s_mov_b64 s[38:39], s[6:7]
; GCN-NEXT: s_mov_b64 s[36:37], s[10:11]
; GCN-NEXT: s_mov_b64 s[38:39], s[8:9]
; GCN-NEXT: s_mov_b64 s[40:41], s[4:5]
; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
@ -354,9 +354,9 @@ define hidden void @blam() {
; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b64 s[4:5], s[40:41]
; GCN-NEXT: s_mov_b64 s[6:7], s[38:39]
; GCN-NEXT: s_mov_b64 s[8:9], s[36:37]
; GCN-NEXT: s_mov_b64 s[10:11], s[34:35]
; GCN-NEXT: s_mov_b64 s[6:7], s[34:35]
; GCN-NEXT: s_mov_b64 s[8:9], s[38:39]
; GCN-NEXT: s_mov_b64 s[10:11], s[36:37]
; GCN-NEXT: s_mov_b32 s12, s47
; GCN-NEXT: s_mov_b32 s13, s46
; GCN-NEXT: s_mov_b32 s14, s45

View File

@ -265,7 +265,7 @@ SerializeToHsacoPass::translateToLLVMIR(llvm::LLVMContext &llvmContext) {
// This constant must always match the default code object ABI version
// of the AMDGPU backend.
addControlConstant("__oclc_ABI_version", 400, 32);
addControlConstant("__oclc_ABI_version", 500, 32);
}
// Determine libraries we need to link - order matters due to dependencies
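The control constant exists because the device libraries branch on __oclc_ABI_version at link time; the encoding is 100 times the code object major version, which is why the value tracks the new default. A small sanity sketch (the helper name is illustrative):

// Illustrative helper: __oclc_ABI_version encodes 100 * the code object
// major version, so the v5 default yields 500.
constexpr int oclcAbiVersion(unsigned CodeObjectMajor) {
  return static_cast<int>(CodeObjectMajor) * 100;
}
static_assert(oclcAbiVersion(4) == 400 && oclcAbiVersion(5) == 500,
              "matches the addControlConstant values above");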