diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll index 8e00e363d438..a3b7178d9df1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll @@ -28,13 +28,13 @@ define i1 @fold_negate_intrinsic_test_mask_dbl(double %x) nounwind { ; Negative test: should not transform for variable test masks ; CHECK: @fold_negate_intrinsic_test_mask_neg_var ; CHECK: %[[X0:.*]] = alloca i32 -; CHECK: %[[X1:.*]] = load i32, i32* %[[X0]] +; CHECK: %[[X1:.*]] = load i32, i32 addrspace(5)* %[[X0]] ; CHECK: call i1 @llvm.amdgcn.class.f32(float %x, i32 %[[X1]]) ; CHECK: xor define i1 @fold_negate_intrinsic_test_mask_neg_var(float %x) nounwind { - %1 = alloca i32 - store i32 7, i32* %1 - %2 = load i32, i32* %1 + %1 = alloca i32, addrspace(5) + store i32 7, i32 addrspace(5)* %1 + %2 = load i32, i32 addrspace(5)* %1 %3 = call i1 @llvm.amdgcn.class.f32(float %x, i32 %2) %4 = xor i1 %3, -1 ret i1 %4 @@ -47,10 +47,10 @@ define i1 @fold_negate_intrinsic_test_mask_neg_var(float %x) nounwind { ; CHECK: store i1 %[[X1]] ; CHECK: %[[X2:.*]] = xor i1 %[[X1]] define i1 @fold_negate_intrinsic_test_mask_neg_multiple_uses(float %x) nounwind { - %y = alloca i1 + %y = alloca i1, addrspace(5) %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 7) %2 = xor i1 %1, -1 - store i1 %1, i1* %y + store i1 %1, i1 addrspace(5)* %y %3 = xor i1 %1, -1 ret i1 %2 } diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll index 253e183645aa..bf09acedb01b 100644 --- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll @@ -12,15 +12,15 @@ define internal void @indirect() { define internal void @direct() { ; CHECK-LABEL: define {{[^@]+}}@direct ; CHECK-SAME: () #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[FPTR:%.*]] = alloca void ()*, align 8 -; CHECK-NEXT: store void ()* @indirect, void ()** [[FPTR]], align 8 -; CHECK-NEXT: [[FP:%.*]] = load void ()*, void ()** [[FPTR]], align 8 +; CHECK-NEXT: [[FPTR:%.*]] = alloca void ()*, align 8, addrspace(5) +; CHECK-NEXT: store void ()* @indirect, void ()* addrspace(5)* [[FPTR]], align 8 +; CHECK-NEXT: [[FP:%.*]] = load void ()*, void ()* addrspace(5)* [[FPTR]], align 8 ; CHECK-NEXT: call void [[FP]]() ; CHECK-NEXT: ret void ; - %fptr = alloca void()* - store void()* @indirect, void()** %fptr - %fp = load void()*, void()** %fptr + %fptr = alloca void()*, addrspace(5) + store void()* @indirect, void()* addrspace(5)* %fptr + %fp = load void()*, void()* addrspace(5)* %fptr call void %fp() ret void } diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll index 414d0652d52a..4ab4717d593f 100644 --- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll +++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll @@ -16,23 +16,23 @@ define internal void @indirect() { define amdgpu_kernel void @test_simple_indirect_call() #0 { ; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call ; AKF_GCN-SAME: () #[[ATTR0:[0-9]+]] { -; AKF_GCN-NEXT: [[FPTR:%.*]] = alloca void ()*, align 8 -; AKF_GCN-NEXT: store void ()* @indirect, void ()** [[FPTR]], align 8 -; AKF_GCN-NEXT: [[FP:%.*]] = load void ()*, void ()** [[FPTR]], align 8 +; AKF_GCN-NEXT: [[FPTR:%.*]] = alloca void ()*, align 8, addrspace(5) +; AKF_GCN-NEXT: store void ()* @indirect, void ()* addrspace(5)* [[FPTR]], align 8 +; AKF_GCN-NEXT: [[FP:%.*]] = load void ()*, void ()* addrspace(5)* [[FPTR]], align 8 ; AKF_GCN-NEXT: call void [[FP]]() ; AKF_GCN-NEXT: ret void ; ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call ; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] { -; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca void ()*, align 8 -; ATTRIBUTOR_GCN-NEXT: store void ()* @indirect, void ()** [[FPTR]], align 8 -; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load void ()*, void ()** [[FPTR]], align 8 +; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca void ()*, align 8, addrspace(5) +; ATTRIBUTOR_GCN-NEXT: store void ()* @indirect, void ()* addrspace(5)* [[FPTR]], align 8 +; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load void ()*, void ()* addrspace(5)* [[FPTR]], align 8 ; ATTRIBUTOR_GCN-NEXT: call void [[FP]]() ; ATTRIBUTOR_GCN-NEXT: ret void ; - %fptr = alloca void()* - store void()* @indirect, void()** %fptr - %fp = load void()*, void()** %fptr + %fptr = alloca void()*, addrspace(5) + store void()* @indirect, void()* addrspace(5)* %fptr + %fp = load void()*, void()* addrspace(5)* %fptr call void %fp() ret void } diff --git a/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll b/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll index e6295744f970..5a830b942d5d 100644 --- a/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll @@ -8,19 +8,19 @@ define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [5 x i32], align 4, addrspace(5) %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 - store i32 4, i32* %arrayidx1, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 + store i32 4, i32 addrspace(5)* %arrayidx1, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 - %2 = load i32, i32* %arrayidx10, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 + %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 - %3 = load i32, i32* %arrayidx12 + %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 + %3 = load i32, i32 addrspace(5)* %arrayidx12 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 store i32 %3, i32 addrspace(1)* %arrayidx13 ret void @@ -30,19 +30,19 @@ entry: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [5 x i32], align 4, addrspace(5) %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 - store i32 4, i32* %arrayidx1, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 + store i32 4, i32 addrspace(5)* %arrayidx1, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 - %2 = load i32, i32* %arrayidx10, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 + %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 - %3 = load i32, i32* %arrayidx12 + %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 + %3 = load i32, i32 addrspace(5)* %arrayidx12 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 store i32 %3, i32 addrspace(1)* %arrayidx13 ret void @@ -54,19 +54,19 @@ entry: define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [5 x i32], align 4, addrspace(5) %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 - store i32 4, i32* %arrayidx1, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 + store i32 4, i32 addrspace(5)* %arrayidx1, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 - %2 = load i32, i32* %arrayidx10, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 + %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 - %3 = load i32, i32* %arrayidx12 + %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 + %3 = load i32, i32 addrspace(5)* %arrayidx12 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 store i32 %3, i32 addrspace(1)* %arrayidx13 ret void @@ -77,19 +77,19 @@ entry: ; SI: alloca [5 x i32] define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [5 x i32], align 4, addrspace(5) %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 - store i32 4, i32* %arrayidx1, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 + store i32 4, i32 addrspace(5)* %arrayidx1, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 - %2 = load i32, i32* %arrayidx10, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 + %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 - %3 = load i32, i32* %arrayidx12 + %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 + %3 = load i32, i32 addrspace(5)* %arrayidx12 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 store i32 %3, i32 addrspace(1)* %arrayidx13 ret void @@ -100,19 +100,19 @@ entry: ; SI: alloca [5 x i32] define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [5 x i32], align 4, addrspace(5) %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 - store i32 4, i32* %arrayidx1, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 + store i32 4, i32 addrspace(5)* %arrayidx1, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 - %2 = load i32, i32* %arrayidx10, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 + %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 - %3 = load i32, i32* %arrayidx12 + %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 + %3 = load i32, i32 addrspace(5)* %arrayidx12 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 store i32 %3, i32 addrspace(1)* %arrayidx13 ret void @@ -124,21 +124,21 @@ entry: ; CI-NOT: alloca define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 { entry: - %stack = alloca [42 x i8], align 4 + %stack = alloca [42 x i8], align 4, addrspace(5) %tmp = load i8, i8 addrspace(1)* %in, align 1 %tmp4 = sext i8 %tmp to i64 - %arrayidx1 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp4 - store i8 4, i8* %arrayidx1, align 1 + %arrayidx1 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4 + store i8 4, i8 addrspace(5)* %arrayidx1, align 1 %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1 %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1 %tmp5 = sext i8 %tmp1 to i64 - %arrayidx3 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp5 - store i8 5, i8* %arrayidx3, align 1 - %arrayidx10 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 0 - %tmp2 = load i8, i8* %arrayidx10, align 1 + %arrayidx3 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5 + store i8 5, i8 addrspace(5)* %arrayidx3, align 1 + %arrayidx10 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 0 + %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1 store i8 %tmp2, i8 addrspace(1)* %out, align 1 - %arrayidx12 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 1 - %tmp3 = load i8, i8* %arrayidx12, align 1 + %arrayidx12 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 1 + %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1 %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1 ret void @@ -150,21 +150,21 @@ entry: define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 { entry: - %stack = alloca [43 x i8], align 4 + %stack = alloca [43 x i8], align 4, addrspace(5) %tmp = load i8, i8 addrspace(1)* %in, align 1 %tmp4 = sext i8 %tmp to i64 - %arrayidx1 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp4 - store i8 4, i8* %arrayidx1, align 1 + %arrayidx1 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4 + store i8 4, i8 addrspace(5)* %arrayidx1, align 1 %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1 %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1 %tmp5 = sext i8 %tmp1 to i64 - %arrayidx3 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp5 - store i8 5, i8* %arrayidx3, align 1 - %arrayidx10 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 0 - %tmp2 = load i8, i8* %arrayidx10, align 1 + %arrayidx3 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5 + store i8 5, i8 addrspace(5)* %arrayidx3, align 1 + %arrayidx10 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 0 + %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1 store i8 %tmp2, i8 addrspace(1)* %out, align 1 - %arrayidx12 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 1 - %tmp3 = load i8, i8* %arrayidx12, align 1 + %arrayidx12 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 1 + %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1 %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1 ret void @@ -176,21 +176,21 @@ entry: ; CI-NOT: alloca define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 { entry: - %stack = alloca [32 x i8], align 4 + %stack = alloca [32 x i8], align 4, addrspace(5) %tmp = load i8, i8 addrspace(1)* %in, align 1 %tmp4 = sext i8 %tmp to i64 - %arrayidx1 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp4 - store i8 4, i8* %arrayidx1, align 1 + %arrayidx1 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4 + store i8 4, i8 addrspace(5)* %arrayidx1, align 1 %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1 %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1 %tmp5 = sext i8 %tmp1 to i64 - %arrayidx3 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp5 - store i8 5, i8* %arrayidx3, align 1 - %arrayidx10 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 0 - %tmp2 = load i8, i8* %arrayidx10, align 1 + %arrayidx3 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5 + store i8 5, i8 addrspace(5)* %arrayidx3, align 1 + %arrayidx10 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 0 + %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1 store i8 %tmp2, i8 addrspace(1)* %out, align 1 - %arrayidx12 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 1 - %tmp3 = load i8, i8* %arrayidx12, align 1 + %arrayidx12 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 1 + %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1 %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1 ret void @@ -202,21 +202,21 @@ entry: define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 { entry: - %stack = alloca [33 x i8], align 4 + %stack = alloca [33 x i8], align 4, addrspace(5) %tmp = load i8, i8 addrspace(1)* %in, align 1 %tmp4 = sext i8 %tmp to i64 - %arrayidx1 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp4 - store i8 4, i8* %arrayidx1, align 1 + %arrayidx1 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4 + store i8 4, i8 addrspace(5)* %arrayidx1, align 1 %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1 %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1 %tmp5 = sext i8 %tmp1 to i64 - %arrayidx3 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp5 - store i8 5, i8* %arrayidx3, align 1 - %arrayidx10 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 0 - %tmp2 = load i8, i8* %arrayidx10, align 1 + %arrayidx3 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5 + store i8 5, i8 addrspace(5)* %arrayidx3, align 1 + %arrayidx10 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 0 + %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1 store i8 %tmp2, i8 addrspace(1)* %out, align 1 - %arrayidx12 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 1 - %tmp3 = load i8, i8* %arrayidx12, align 1 + %arrayidx12 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 1 + %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1 %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1 ret void @@ -228,21 +228,21 @@ entry: ; CI-NOT: alloca define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 { entry: - %stack = alloca [28 x i8], align 4 + %stack = alloca [28 x i8], align 4, addrspace(5) %tmp = load i8, i8 addrspace(1)* %in, align 1 %tmp4 = sext i8 %tmp to i64 - %arrayidx1 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp4 - store i8 4, i8* %arrayidx1, align 1 + %arrayidx1 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4 + store i8 4, i8 addrspace(5)* %arrayidx1, align 1 %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1 %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1 %tmp5 = sext i8 %tmp1 to i64 - %arrayidx3 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp5 - store i8 5, i8* %arrayidx3, align 1 - %arrayidx10 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 0 - %tmp2 = load i8, i8* %arrayidx10, align 1 + %arrayidx3 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5 + store i8 5, i8 addrspace(5)* %arrayidx3, align 1 + %arrayidx10 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 0 + %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1 store i8 %tmp2, i8 addrspace(1)* %out, align 1 - %arrayidx12 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 1 - %tmp3 = load i8, i8* %arrayidx12, align 1 + %arrayidx12 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 1 + %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1 %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1 ret void @@ -254,21 +254,21 @@ entry: define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 { entry: - %stack = alloca [29 x i8], align 4 + %stack = alloca [29 x i8], align 4, addrspace(5) %tmp = load i8, i8 addrspace(1)* %in, align 1 %tmp4 = sext i8 %tmp to i64 - %arrayidx1 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp4 - store i8 4, i8* %arrayidx1, align 1 + %arrayidx1 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4 + store i8 4, i8 addrspace(5)* %arrayidx1, align 1 %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1 %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1 %tmp5 = sext i8 %tmp1 to i64 - %arrayidx3 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp5 - store i8 5, i8* %arrayidx3, align 1 - %arrayidx10 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 0 - %tmp2 = load i8, i8* %arrayidx10, align 1 + %arrayidx3 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5 + store i8 5, i8 addrspace(5)* %arrayidx3, align 1 + %arrayidx10 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 0 + %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1 store i8 %tmp2, i8 addrspace(1)* %out, align 1 - %arrayidx12 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 1 - %tmp3 = load i8, i8* %arrayidx12, align 1 + %arrayidx12 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 1 + %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1 %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1 ret void diff --git a/llvm/test/CodeGen/AMDGPU/opencl-printf.ll b/llvm/test/CodeGen/AMDGPU/opencl-printf.ll index 42416aa96136..4bf3dbd4dc75 100644 --- a/llvm/test/CodeGen/AMDGPU/opencl-printf.ll +++ b/llvm/test/CodeGen/AMDGPU/opencl-printf.ll @@ -6,7 +6,7 @@ ; FUNC-LABEL: @test_kernel( ; R600-LABEL: entry ; R600-NOT: call i8 addrspace(1)* @__printf_alloc -; R600: call i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(2)* @.str, i32 0, i32 0), i8* %arraydecay, i32 %n) +; R600: call i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(2)* @.str, i32 0, i32 0), i8 addrspace(5)* %arraydecay, i32 %n) ; GCN-LABEL: entry ; GCN: call i8 addrspace(1)* @__printf_alloc ; GCN-LABEL: entry.split @@ -15,7 +15,7 @@ ; GCN: %PrintBuffIdCast = bitcast i8 addrspace(1)* %PrintBuffID to i32 addrspace(1)* ; GCN: store i32 1, i32 addrspace(1)* %PrintBuffIdCast ; GCN: %PrintBuffGep = getelementptr i8, i8 addrspace(1)* %printf_alloc_fn, i32 4 -; GCN: %PrintArgPtr = ptrtoint i8* %arraydecay to i64 +; GCN: %PrintArgPtr = ptrtoint i8 addrspace(5)* %arraydecay to i64 ; GCN: %PrintBuffPtrCast = bitcast i8 addrspace(1)* %PrintBuffGep to i64 addrspace(1)* ; GCN: store i64 %PrintArgPtr, i64 addrspace(1)* %PrintBuffPtrCast ; GCN: %PrintBuffNextPtr = getelementptr i8, i8 addrspace(1)* %PrintBuffGep, i32 8 @@ -26,9 +26,9 @@ define amdgpu_kernel void @test_kernel(i32 %n) { entry: - %str = alloca [9 x i8], align 1 - %arraydecay = getelementptr inbounds [9 x i8], [9 x i8]* %str, i32 0, i32 0 - %call1 = call i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(2)* @.str, i32 0, i32 0), i8* %arraydecay, i32 %n) + %str = alloca [9 x i8], align 1, addrspace(5) + %arraydecay = getelementptr inbounds [9 x i8], [9 x i8] addrspace(5)* %str, i32 0, i32 0 + %call1 = call i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(2)* @.str, i32 0, i32 0), i8 addrspace(5)* %arraydecay, i32 %n) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/parallelorifcollapse.ll b/llvm/test/CodeGen/AMDGPU/parallelorifcollapse.ll index e199d5b5df25..1d55c9ee074a 100644 --- a/llvm/test/CodeGen/AMDGPU/parallelorifcollapse.ll +++ b/llvm/test/CodeGen/AMDGPU/parallelorifcollapse.ll @@ -14,23 +14,23 @@ ; CHECK-NEXT: OR_INT define amdgpu_kernel void @_Z9chk1D_512v() #0 { entry: - %a0 = alloca i32, align 4 - %b0 = alloca i32, align 4 - %c0 = alloca i32, align 4 - %d0 = alloca i32, align 4 - %a1 = alloca i32, align 4 - %b1 = alloca i32, align 4 - %c1 = alloca i32, align 4 - %d1 = alloca i32, align 4 - %data = alloca i32, align 4 - %0 = load i32, i32* %a0, align 4 - %1 = load i32, i32* %b0, align 4 + %a0 = alloca i32, align 4, addrspace(5) + %b0 = alloca i32, align 4, addrspace(5) + %c0 = alloca i32, align 4, addrspace(5) + %d0 = alloca i32, align 4, addrspace(5) + %a1 = alloca i32, align 4, addrspace(5) + %b1 = alloca i32, align 4, addrspace(5) + %c1 = alloca i32, align 4, addrspace(5) + %d1 = alloca i32, align 4, addrspace(5) + %data = alloca i32, align 4, addrspace(5) + %0 = load i32, i32 addrspace(5)* %a0, align 4 + %1 = load i32, i32 addrspace(5)* %b0, align 4 %cmp = icmp ne i32 %0, %1 br i1 %cmp, label %land.lhs.true, label %if.else land.lhs.true: ; preds = %entry - %2 = load i32, i32* %c0, align 4 - %3 = load i32, i32* %d0, align 4 + %2 = load i32, i32 addrspace(5)* %c0, align 4 + %3 = load i32, i32 addrspace(5)* %d0, align 4 %cmp1 = icmp ne i32 %2, %3 br i1 %cmp1, label %if.then, label %if.else @@ -38,18 +38,18 @@ if.then: ; preds = %land.lhs.true br label %if.end if.else: ; preds = %land.lhs.true, %entry - store i32 1, i32* %data, align 4 + store i32 1, i32 addrspace(5)* %data, align 4 br label %if.end if.end: ; preds = %if.else, %if.then - %4 = load i32, i32* %a1, align 4 - %5 = load i32, i32* %b1, align 4 + %4 = load i32, i32 addrspace(5)* %a1, align 4 + %5 = load i32, i32 addrspace(5)* %b1, align 4 %cmp2 = icmp ne i32 %4, %5 br i1 %cmp2, label %land.lhs.true3, label %if.else6 land.lhs.true3: ; preds = %if.end - %6 = load i32, i32* %c1, align 4 - %7 = load i32, i32* %d1, align 4 + %6 = load i32, i32 addrspace(5)* %c1, align 4 + %7 = load i32, i32 addrspace(5)* %d1, align 4 %cmp4 = icmp ne i32 %6, %7 br i1 %cmp4, label %if.then5, label %if.else6 @@ -57,7 +57,7 @@ if.then5: ; preds = %land.lhs.true3 br label %if.end7 if.else6: ; preds = %land.lhs.true3, %if.end - store i32 1, i32* %data, align 4 + store i32 1, i32 addrspace(5)* %data, align 4 br label %if.end7 if.end7: ; preds = %if.else6, %if.then5 diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll index 7f5f6180f473..7f137d5ab751 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll @@ -5,15 +5,15 @@ ; CHECK-LABEL: @invalid_bitcast_addrspace( ; CHECK: getelementptr inbounds [256 x [1 x i32]], [256 x [1 x i32]] addrspace(3)* @invalid_bitcast_addrspace.data, i32 0, i32 %14 ; CHECK: bitcast [1 x i32] addrspace(3)* %{{[0-9]+}} to half addrspace(3)* -; CHECK: addrspacecast half addrspace(3)* %tmp to half addrspace(4)* -; CHECK: bitcast half addrspace(4)* %tmp1 to <2 x i16> addrspace(4)* +; CHECK: addrspacecast half addrspace(3)* %tmp to half* +; CHECK: bitcast half* %tmp1 to <2 x i16>* define amdgpu_kernel void @invalid_bitcast_addrspace() #0 { entry: - %data = alloca [1 x i32], align 4 - %tmp = bitcast [1 x i32]* %data to half* - %tmp1 = addrspacecast half* %tmp to half addrspace(4)* - %tmp2 = bitcast half addrspace(4)* %tmp1 to <2 x i16> addrspace(4)* - %tmp3 = load <2 x i16>, <2 x i16> addrspace(4)* %tmp2, align 2 + %data = alloca [1 x i32], addrspace(5) + %tmp = bitcast [1 x i32] addrspace(5)* %data to half addrspace(5)* + %tmp1 = addrspacecast half addrspace(5)* %tmp to half* + %tmp2 = bitcast half* %tmp1 to <2 x i16>* + %tmp3 = load <2 x i16>, <2 x i16>* %tmp2, align 2 %tmp4 = bitcast <2 x i16> %tmp3 to <2 x half> ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll index 3a647a737e5e..5b1959801d87 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll @@ -16,19 +16,19 @@ define amdgpu_vs void @promote_1d_aggr() #0 { ; CHECK-LABEL: @promote_1d_aggr( -; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[F1:%.*]] = alloca [1 x float], align 4 +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +; CHECK-NEXT: [[F1:%.*]] = alloca [1 x float], align 4, addrspace(5) ; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], [[BLOCK]] addrspace(1)* @block, i32 0, i32 1 ; CHECK-NEXT: [[FOO1:%.*]] = load i32, i32 addrspace(1)* [[FOO]], align 4 -; CHECK-NEXT: store i32 [[FOO1]], i32* [[I]], align 4 +; CHECK-NEXT: store i32 [[FOO1]], i32 addrspace(5)* [[I]], align 4 ; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [[BLOCK]], [[BLOCK]] addrspace(1)* @block, i32 0, i32 0 ; CHECK-NEXT: [[FOO3:%.*]] = load [1 x float], [1 x float] addrspace(1)* [[FOO2]], align 4 -; CHECK-NEXT: store [1 x float] [[FOO3]], [1 x float]* [[F1]], align 4 -; CHECK-NEXT: [[FOO4:%.*]] = load i32, i32* [[I]], align 4 -; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [1 x float], [1 x float]* [[F1]], i32 0, i32 [[FOO4]] -; CHECK-NEXT: [[FOO6:%.*]] = load float, float* [[FOO5]], align 4 -; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16 -; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, <4 x float>* [[FOO7]], align 16 +; CHECK-NEXT: store [1 x float] [[FOO3]], [1 x float] addrspace(5)* [[F1]], align 4 +; CHECK-NEXT: [[FOO4:%.*]] = load i32, i32 addrspace(5)* [[I]], align 4 +; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [1 x float], [1 x float] addrspace(5)* [[F1]], i32 0, i32 [[FOO4]] +; CHECK-NEXT: [[FOO6:%.*]] = load float, float addrspace(5)* [[FOO5]], align 4 +; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5) +; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, <4 x float> addrspace(5)* [[FOO7]], align 16 ; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[FOO6]], i32 0 ; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1 ; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2 @@ -37,19 +37,19 @@ define amdgpu_vs void @promote_1d_aggr() #0 { ; CHECK-NEXT: store <4 x float> [[FOO12]], <4 x float> addrspace(1)* [[FOO13]], align 16 ; CHECK-NEXT: ret void ; - %i = alloca i32 - %f1 = alloca [1 x float] + %i = alloca i32, addrspace(5) + %f1 = alloca [1 x float], addrspace(5) %foo = getelementptr %Block, %Block addrspace(1)* @block, i32 0, i32 1 %foo1 = load i32, i32 addrspace(1)* %foo - store i32 %foo1, i32* %i + store i32 %foo1, i32 addrspace(5)* %i %foo2 = getelementptr %Block, %Block addrspace(1)* @block, i32 0, i32 0 %foo3 = load [1 x float], [1 x float] addrspace(1)* %foo2 - store [1 x float] %foo3, [1 x float]* %f1 - %foo4 = load i32, i32* %i - %foo5 = getelementptr [1 x float], [1 x float]* %f1, i32 0, i32 %foo4 - %foo6 = load float, float* %foo5 - %foo7 = alloca <4 x float> - %foo8 = load <4 x float>, <4 x float>* %foo7 + store [1 x float] %foo3, [1 x float] addrspace(5)* %f1 + %foo4 = load i32, i32 addrspace(5)* %i + %foo5 = getelementptr [1 x float], [1 x float] addrspace(5)* %f1, i32 0, i32 %foo4 + %foo6 = load float, float addrspace(5)* %foo5 + %foo7 = alloca <4 x float>, addrspace(5) + %foo8 = load <4 x float>, <4 x float> addrspace(5)* %foo7 %foo9 = insertelement <4 x float> %foo8, float %foo6, i32 0 %foo10 = insertelement <4 x float> %foo9, float %foo6, i32 1 %foo11 = insertelement <4 x float> %foo10, float %foo6, i32 2 @@ -64,42 +64,42 @@ define amdgpu_vs void @promote_1d_aggr() #0 { define amdgpu_vs void @promote_store_aggr() #0 { ; CHECK-LABEL: @promote_store_aggr( -; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[F1:%.*]] = alloca [2 x float], align 4 +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +; CHECK-NEXT: [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5) ; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK2:%.*]], [[BLOCK2]] addrspace(1)* @block2, i32 0, i32 0 ; CHECK-NEXT: [[FOO1:%.*]] = load i32, i32 addrspace(1)* [[FOO]], align 4 -; CHECK-NEXT: store i32 [[FOO1]], i32* [[I]], align 4 -; CHECK-NEXT: [[FOO2:%.*]] = load i32, i32* [[I]], align 4 +; CHECK-NEXT: store i32 [[FOO1]], i32 addrspace(5)* [[I]], align 4 +; CHECK-NEXT: [[FOO2:%.*]] = load i32, i32 addrspace(5)* [[I]], align 4 ; CHECK-NEXT: [[FOO3:%.*]] = sitofp i32 [[FOO2]] to float -; CHECK-NEXT: [[FOO4:%.*]] = getelementptr [2 x float], [2 x float]* [[F1]], i32 0, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x float]* [[F1]] to <2 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 8 +; CHECK-NEXT: [[FOO4:%.*]] = getelementptr [2 x float], [2 x float] addrspace(5)* [[F1]], i32 0, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x float] addrspace(5)* [[F1]] to <2 x float> addrspace(5)* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float> addrspace(5)* [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[FOO3]], i32 0 -; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP1]], align 8 -; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], [2 x float]* [[F1]], i32 0, i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x float]* [[F1]] to <2 x float>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 8 +; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float> addrspace(5)* [[TMP1]], align 8 +; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], [2 x float] addrspace(5)* [[F1]], i32 0, i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x float] addrspace(5)* [[F1]] to <2 x float> addrspace(5)* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float> addrspace(5)* [[TMP4]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float 2.000000e+00, i64 1 -; CHECK-NEXT: store <2 x float> [[TMP6]], <2 x float>* [[TMP4]], align 8 -; CHECK-NEXT: [[FOO6:%.*]] = load [2 x float], [2 x float]* [[F1]], align 4 +; CHECK-NEXT: store <2 x float> [[TMP6]], <2 x float> addrspace(5)* [[TMP4]], align 8 +; CHECK-NEXT: [[FOO6:%.*]] = load [2 x float], [2 x float] addrspace(5)* [[F1]], align 4 ; CHECK-NEXT: [[FOO7:%.*]] = getelementptr [[BLOCK2]], [[BLOCK2]] addrspace(1)* @block2, i32 0, i32 1 ; CHECK-NEXT: store [2 x float] [[FOO6]], [2 x float] addrspace(1)* [[FOO7]], align 4 ; CHECK-NEXT: [[FOO8:%.*]] = getelementptr [[GL_PERVERTEX:%.*]], [[GL_PERVERTEX]] addrspace(1)* @pv, i32 0, i32 0 ; CHECK-NEXT: store <4 x float> , <4 x float> addrspace(1)* [[FOO8]], align 16 ; CHECK-NEXT: ret void ; - %i = alloca i32 - %f1 = alloca [2 x float] + %i = alloca i32, addrspace(5) + %f1 = alloca [2 x float], addrspace(5) %foo = getelementptr %Block2, %Block2 addrspace(1)* @block2, i32 0, i32 0 %foo1 = load i32, i32 addrspace(1)* %foo - store i32 %foo1, i32* %i - %foo2 = load i32, i32* %i + store i32 %foo1, i32 addrspace(5)* %i + %foo2 = load i32, i32 addrspace(5)* %i %foo3 = sitofp i32 %foo2 to float - %foo4 = getelementptr [2 x float], [2 x float]* %f1, i32 0, i32 0 - store float %foo3, float* %foo4 - %foo5 = getelementptr [2 x float], [2 x float]* %f1, i32 0, i32 1 - store float 2.000000e+00, float* %foo5 - %foo6 = load [2 x float], [2 x float]* %f1 + %foo4 = getelementptr [2 x float], [2 x float] addrspace(5)* %f1, i32 0, i32 0 + store float %foo3, float addrspace(5)* %foo4 + %foo5 = getelementptr [2 x float], [2 x float] addrspace(5)* %f1, i32 0, i32 1 + store float 2.000000e+00, float addrspace(5)* %foo5 + %foo6 = load [2 x float], [2 x float] addrspace(5)* %f1 %foo7 = getelementptr %Block2, %Block2 addrspace(1)* @block2, i32 0, i32 1 store [2 x float] %foo6, [2 x float] addrspace(1)* %foo7 %foo8 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(1)* @pv, i32 0, i32 0 @@ -112,21 +112,21 @@ define amdgpu_vs void @promote_store_aggr() #0 { define amdgpu_vs void @promote_load_from_store_aggr() #0 { ; CHECK-LABEL: @promote_load_from_store_aggr( -; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[F1:%.*]] = alloca [2 x float], align 4 +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +; CHECK-NEXT: [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5) ; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], [[BLOCK3]] addrspace(1)* @block3, i32 0, i32 1 ; CHECK-NEXT: [[FOO1:%.*]] = load i32, i32 addrspace(1)* [[FOO]], align 4 -; CHECK-NEXT: store i32 [[FOO1]], i32* [[I]], align 4 +; CHECK-NEXT: store i32 [[FOO1]], i32 addrspace(5)* [[I]], align 4 ; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [[BLOCK3]], [[BLOCK3]] addrspace(1)* @block3, i32 0, i32 0 ; CHECK-NEXT: [[FOO3:%.*]] = load [2 x float], [2 x float] addrspace(1)* [[FOO2]], align 4 -; CHECK-NEXT: store [2 x float] [[FOO3]], [2 x float]* [[F1]], align 4 -; CHECK-NEXT: [[FOO4:%.*]] = load i32, i32* [[I]], align 4 -; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], [2 x float]* [[F1]], i32 0, i32 [[FOO4]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x float]* [[F1]] to <2 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 8 +; CHECK-NEXT: store [2 x float] [[FOO3]], [2 x float] addrspace(5)* [[F1]], align 4 +; CHECK-NEXT: [[FOO4:%.*]] = load i32, i32 addrspace(5)* [[I]], align 4 +; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], [2 x float] addrspace(5)* [[F1]], i32 0, i32 [[FOO4]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x float] addrspace(5)* [[F1]] to <2 x float> addrspace(5)* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float> addrspace(5)* [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO4]] -; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16 -; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, <4 x float>* [[FOO7]], align 16 +; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5) +; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, <4 x float> addrspace(5)* [[FOO7]], align 16 ; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP3]], i32 0 ; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1 ; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP3]], i32 2 @@ -135,19 +135,19 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 { ; CHECK-NEXT: store <4 x float> [[FOO12]], <4 x float> addrspace(1)* [[FOO13]], align 16 ; CHECK-NEXT: ret void ; - %i = alloca i32 - %f1 = alloca [2 x float] + %i = alloca i32, addrspace(5) + %f1 = alloca [2 x float], addrspace(5) %foo = getelementptr %Block3, %Block3 addrspace(1)* @block3, i32 0, i32 1 %foo1 = load i32, i32 addrspace(1)* %foo - store i32 %foo1, i32* %i + store i32 %foo1, i32 addrspace(5)* %i %foo2 = getelementptr %Block3, %Block3 addrspace(1)* @block3, i32 0, i32 0 %foo3 = load [2 x float], [2 x float] addrspace(1)* %foo2 - store [2 x float] %foo3, [2 x float]* %f1 - %foo4 = load i32, i32* %i - %foo5 = getelementptr [2 x float], [2 x float]* %f1, i32 0, i32 %foo4 - %foo6 = load float, float* %foo5 - %foo7 = alloca <4 x float> - %foo8 = load <4 x float>, <4 x float>* %foo7 + store [2 x float] %foo3, [2 x float] addrspace(5)* %f1 + %foo4 = load i32, i32 addrspace(5)* %i + %foo5 = getelementptr [2 x float], [2 x float] addrspace(5)* %f1, i32 0, i32 %foo4 + %foo6 = load float, float addrspace(5)* %foo5 + %foo7 = alloca <4 x float>, addrspace(5) + %foo8 = load <4 x float>, <4 x float> addrspace(5)* %foo7 %foo9 = insertelement <4 x float> %foo8, float %foo6, i32 0 %foo10 = insertelement <4 x float> %foo9, float %foo6, i32 1 %foo11 = insertelement <4 x float> %foo10, float %foo6, i32 2 @@ -162,35 +162,35 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 { define amdgpu_ps void @promote_double_aggr() #0 { ; CHECK-LABEL: @promote_double_aggr( -; CHECK-NEXT: [[S:%.*]] = alloca [2 x double], align 8 +; CHECK-NEXT: [[S:%.*]] = alloca [2 x double], align 8, addrspace(5) ; CHECK-NEXT: [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 0 ; CHECK-NEXT: [[FOO1:%.*]] = load double, double addrspace(1)* [[FOO]], align 8 ; CHECK-NEXT: [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 1 ; CHECK-NEXT: [[FOO3:%.*]] = load double, double addrspace(1)* [[FOO2]], align 8 ; CHECK-NEXT: [[FOO4:%.*]] = insertvalue [2 x double] undef, double [[FOO1]], 0 ; CHECK-NEXT: [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1 -; CHECK-NEXT: store [2 x double] [[FOO5]], [2 x double]* [[S]], align 8 -; CHECK-NEXT: [[FOO6:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 16 +; CHECK-NEXT: store [2 x double] [[FOO5]], [2 x double] addrspace(5)* [[S]], align 8 +; CHECK-NEXT: [[FOO6:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP1]], align 16 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i64 1 -; CHECK-NEXT: [[FOO8:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 16 +; CHECK-NEXT: [[FOO8:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP4]], align 16 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 1 ; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[TMP3]], [[TMP6]] -; CHECK-NEXT: [[FOO11:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>* -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 16 +; CHECK-NEXT: [[FOO11:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)* +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP7]], align 16 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[FOO10]], i32 0 -; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP7]], align 16 -; CHECK-NEXT: [[FOO12:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>* -; CHECK-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double>* [[TMP10]], align 16 +; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double> addrspace(5)* [[TMP7]], align 16 +; CHECK-NEXT: [[FOO12:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)* +; CHECK-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP10]], align 16 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 0 -; CHECK-NEXT: [[FOO14:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>* -; CHECK-NEXT: [[TMP14:%.*]] = load <2 x double>, <2 x double>* [[TMP13]], align 16 +; CHECK-NEXT: [[FOO14:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)* +; CHECK-NEXT: [[TMP14:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP13]], align 16 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[TMP14]], i64 1 ; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[TMP12]], [[TMP15]] ; CHECK-NEXT: [[FOO17:%.*]] = fptrunc double [[FOO16]] to float @@ -201,25 +201,25 @@ define amdgpu_ps void @promote_double_aggr() #0 { ; CHECK-NEXT: store <4 x float> [[FOO21]], <4 x float> addrspace(1)* @frag_color, align 16 ; CHECK-NEXT: ret void ; - %s = alloca [2 x double] + %s = alloca [2 x double], addrspace(5) %foo = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 0 %foo1 = load double, double addrspace(1)* %foo %foo2 = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 1 %foo3 = load double, double addrspace(1)* %foo2 %foo4 = insertvalue [2 x double] undef, double %foo1, 0 %foo5 = insertvalue [2 x double] %foo4, double %foo3, 1 - store [2 x double] %foo5, [2 x double]* %s - %foo6 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 1 - %foo7 = load double, double* %foo6 - %foo8 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 1 - %foo9 = load double, double* %foo8 + store [2 x double] %foo5, [2 x double] addrspace(5)* %s + %foo6 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 1 + %foo7 = load double, double addrspace(5)* %foo6 + %foo8 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 1 + %foo9 = load double, double addrspace(5)* %foo8 %foo10 = fadd double %foo7, %foo9 - %foo11 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 0 - store double %foo10, double* %foo11 - %foo12 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 0 - %foo13 = load double, double* %foo12 - %foo14 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 1 - %foo15 = load double, double* %foo14 + %foo11 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 0 + store double %foo10, double addrspace(5)* %foo11 + %foo12 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 0 + %foo13 = load double, double addrspace(5)* %foo12 + %foo14 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 1 + %foo15 = load double, double addrspace(5)* %foo14 %foo16 = fadd double %foo13, %foo15 %foo17 = fptrunc double %foo16 to float %foo18 = insertelement <4 x float> undef, float %foo17, i32 0 @@ -253,6 +253,6 @@ define amdgpu_kernel void @alloca_struct() #0 { ; CHECK-NEXT: ret void ; entry: - %alloca = alloca [2 x %struct], align 4 + %alloca = alloca [2 x %struct], align 4, addrspace(5) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll index a76d72fb6934..9d6f10bda03f 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll @@ -4,44 +4,44 @@ ; number of elements. ; CHECK-LABEL: @array_alloca( -; CHECK: %stack = alloca i32, i32 5, align 4 +; CHECK: %stack = alloca i32, i32 5, align 4, addrspace(5) define amdgpu_kernel void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: - %stack = alloca i32, i32 5, align 4 + %stack = alloca i32, i32 5, align 4, addrspace(5) %ld0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds i32, i32* %stack, i32 %ld0 - store i32 4, i32* %arrayidx1, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld0 + store i32 4, i32 addrspace(5)* %arrayidx1, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %ld1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds i32, i32* %stack, i32 %ld1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds i32, i32* %stack, i32 0 - %ld2 = load i32, i32* %arrayidx10, align 4 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 0 + %ld2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 store i32 %ld2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds i32, i32* %stack, i32 1 - %ld3 = load i32, i32* %arrayidx12 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 1 + %ld3 = load i32, i32 addrspace(5)* %arrayidx12 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 store i32 %ld3, i32 addrspace(1)* %arrayidx13 ret void } ; CHECK-LABEL: @array_alloca_dynamic( -; CHECK: %stack = alloca i32, i32 %size, align 4 +; CHECK: %stack = alloca i32, i32 %size, align 4, addrspace(5) define amdgpu_kernel void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 { entry: - %stack = alloca i32, i32 %size, align 4 + %stack = alloca i32, i32 %size, align 4, addrspace(5) %ld0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds i32, i32* %stack, i32 %ld0 - store i32 4, i32* %arrayidx1, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld0 + store i32 4, i32 addrspace(5)* %arrayidx1, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %ld1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds i32, i32* %stack, i32 %ld1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds i32, i32* %stack, i32 0 - %ld2 = load i32, i32* %arrayidx10, align 4 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 0 + %ld2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 store i32 %ld2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds i32, i32* %stack, i32 1 - %ld3 = load i32, i32* %arrayidx12 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 1 + %ld3 = load i32, i32 addrspace(5)* %arrayidx12 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 store i32 %ld3, i32 addrspace(1)* %arrayidx13 ret void diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll index 2ce0e07954a6..548ec44daf8f 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll @@ -1,28 +1,28 @@ ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca < %s | FileCheck %s -declare void @llvm.memcpy.p0i8.p1i8.i32(i8* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0 -declare void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i1) #0 -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) #0 +declare void @llvm.memcpy.p5i8.p1i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0 +declare void @llvm.memcpy.p1i8.p5i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(5)* nocapture, i32, i1) #0 +declare void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* nocapture, i8 addrspace(5)* nocapture, i64, i1) #0 -declare void @llvm.memmove.p0i8.p1i8.i32(i8* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0 -declare void @llvm.memmove.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i1) #0 -declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) #0 +declare void @llvm.memmove.p5i8.p1i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0 +declare void @llvm.memmove.p1i8.p5i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(5)* nocapture, i32, i1) #0 +declare void @llvm.memmove.p5i8.p5i8.i64(i8 addrspace(5)* nocapture, i8 addrspace(5)* nocapture, i64, i1) #0 -declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i1) #0 +declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture, i8, i32, i1) #0 -declare i32 @llvm.objectsize.i32.p0i8(i8*, i1, i1, i1) #1 +declare i32 @llvm.objectsize.i32.p5i8(i8 addrspace(5)*, i1, i1, i1) #1 ; CHECK-LABEL: @promote_with_memcpy( ; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false) ; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(3)* align 4 %alloca.bc, i32 68, i1 false) define amdgpu_kernel void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %alloca = alloca [17 x i32], align 4 - %alloca.bc = bitcast [17 x i32]* %alloca to i8* + %alloca = alloca [17 x i32], align 4, addrspace(5) + %alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)* %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)* %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memcpy.p0i8.p1i8.i32(i8* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false) - call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* align 4 %out.bc, i8* align 4 %alloca.bc, i32 68, i1 false) + call void @llvm.memcpy.p5i8.p1i8.i32(i8 addrspace(5)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false) + call void @llvm.memcpy.p1i8.p5i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(5)* align 4 %alloca.bc, i32 68, i1 false) ret void } @@ -31,12 +31,12 @@ define amdgpu_kernel void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrs ; CHECK: call void @llvm.memmove.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false) ; CHECK: call void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(3)* align 4 %alloca.bc, i32 68, i1 false) define amdgpu_kernel void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %alloca = alloca [17 x i32], align 4 - %alloca.bc = bitcast [17 x i32]* %alloca to i8* + %alloca = alloca [17 x i32], align 4, addrspace(5) + %alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)* %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)* %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memmove.p0i8.p1i8.i32(i8* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false) - call void @llvm.memmove.p1i8.p0i8.i32(i8 addrspace(1)* align 4 %out.bc, i8* align 4 %alloca.bc, i32 68, i1 false) + call void @llvm.memmove.p5i8.p1i8.i32(i8 addrspace(5)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false) + call void @llvm.memmove.p1i8.p5i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(5)* align 4 %alloca.bc, i32 68, i1 false) ret void } @@ -44,11 +44,11 @@ define amdgpu_kernel void @promote_with_memmove(i32 addrspace(1)* %out, i32 addr ; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: call void @llvm.memset.p3i8.i32(i8 addrspace(3)* align 4 %alloca.bc, i8 7, i32 68, i1 false) define amdgpu_kernel void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %alloca = alloca [17 x i32], align 4 - %alloca.bc = bitcast [17 x i32]* %alloca to i8* + %alloca = alloca [17 x i32], align 4, addrspace(5) + %alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)* %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)* %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memset.p0i8.i32(i8* align 4 %alloca.bc, i8 7, i32 68, i1 false) + call void @llvm.memset.p5i8.i32(i8 addrspace(5)* align 4 %alloca.bc, i8 7, i32 68, i1 false) ret void } @@ -56,9 +56,9 @@ define amdgpu_kernel void @promote_with_memset(i32 addrspace(1)* %out, i32 addrs ; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %alloca.bc, i1 false, i1 false, i1 false) define amdgpu_kernel void @promote_with_objectsize(i32 addrspace(1)* %out) #0 { - %alloca = alloca [17 x i32], align 4 - %alloca.bc = bitcast [17 x i32]* %alloca to i8* - %size = call i32 @llvm.objectsize.i32.p0i8(i8* %alloca.bc, i1 false, i1 false, i1 false) + %alloca = alloca [17 x i32], align 4, addrspace(5) + %alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)* + %size = call i32 @llvm.objectsize.i32.p5i8(i8 addrspace(5)* %alloca.bc, i1 false, i1 false, i1 false) store i32 %size, i32 addrspace(1)* %out ret void } @@ -69,12 +69,12 @@ define amdgpu_kernel void @promote_with_objectsize(i32 addrspace(1)* %out) #0 { ; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* align 8 dereferenceable(16) %i, i8 addrspace(3)* align 8 dereferenceable(16) %i1, i64 16, i1 false) define amdgpu_kernel void @promote_alloca_used_twice_in_memcpy(i32 %c) { entry: - %r = alloca double, align 8 - %arrayidx1 = getelementptr inbounds double, double* %r, i32 1 - %i = bitcast double* %arrayidx1 to i8* - %arrayidx2 = getelementptr inbounds double, double* %r, i32 %c - %i1 = bitcast double* %arrayidx2 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 dereferenceable(16) %i, i8* align 8 dereferenceable(16) %i1, i64 16, i1 false) + %r = alloca double, align 8, addrspace(5) + %arrayidx1 = getelementptr inbounds double, double addrspace(5)* %r, i32 1 + %i = bitcast double addrspace(5)* %arrayidx1 to i8 addrspace(5)* + %arrayidx2 = getelementptr inbounds double, double addrspace(5)* %r, i32 %c + %i1 = bitcast double addrspace(5)* %arrayidx2 to i8 addrspace(5)* + call void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* align 8 dereferenceable(16) %i, i8 addrspace(5)* align 8 dereferenceable(16) %i1, i64 16, i1 false) ret void } @@ -84,12 +84,12 @@ entry: ; CHECK: call void @llvm.memmove.p3i8.p3i8.i64(i8 addrspace(3)* align 8 dereferenceable(16) %i, i8 addrspace(3)* align 8 dereferenceable(16) %i1, i64 16, i1 false) define amdgpu_kernel void @promote_alloca_used_twice_in_memmove(i32 %c) { entry: - %r = alloca double, align 8 - %arrayidx1 = getelementptr inbounds double, double* %r, i32 1 - %i = bitcast double* %arrayidx1 to i8* - %arrayidx2 = getelementptr inbounds double, double* %r, i32 %c - %i1 = bitcast double* %arrayidx2 to i8* - call void @llvm.memmove.p0i8.p0i8.i64(i8* align 8 dereferenceable(16) %i, i8* align 8 dereferenceable(16) %i1, i64 16, i1 false) + %r = alloca double, align 8, addrspace(5) + %arrayidx1 = getelementptr inbounds double, double addrspace(5)* %r, i32 1 + %i = bitcast double addrspace(5)* %arrayidx1 to i8 addrspace(5)* + %arrayidx2 = getelementptr inbounds double, double addrspace(5)* %r, i32 %c + %i1 = bitcast double addrspace(5)* %arrayidx2 to i8 addrspace(5)* + call void @llvm.memmove.p5i8.p5i8.i64(i8 addrspace(5)* align 8 dereferenceable(16) %i, i8 addrspace(5)* align 8 dereferenceable(16) %i1, i64 16, i1 false) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll index 0c5a76e980b2..2e8a15d99ccd 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll @@ -31,18 +31,18 @@ define amdgpu_kernel void @lds_promoted_alloca_icmp_same_derived_pointer(i32 add ; CHECK-NEXT: ret void ; ; NOLDS-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer( -; NOLDS-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 4 -; NOLDS-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], [16 x i32]* [[ALLOCA]], i32 0, i32 [[A:%.*]] -; NOLDS-NEXT: [[PTR1:%.*]] = getelementptr inbounds [16 x i32], [16 x i32]* [[ALLOCA]], i32 0, i32 [[B:%.*]] -; NOLDS-NEXT: [[CMP:%.*]] = icmp eq i32* [[PTR0]], [[PTR1]] +; NOLDS-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5) +; NOLDS-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[A:%.*]] +; NOLDS-NEXT: [[PTR1:%.*]] = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[B:%.*]] +; NOLDS-NEXT: [[CMP:%.*]] = icmp eq i32 addrspace(5)* [[PTR0]], [[PTR1]] ; NOLDS-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32 ; NOLDS-NEXT: store volatile i32 [[ZEXT]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; NOLDS-NEXT: ret void ; - %alloca = alloca [16 x i32], align 4 - %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a - %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b - %cmp = icmp eq i32* %ptr0, %ptr1 + %alloca = alloca [16 x i32], align 4, addrspace(5) + %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a + %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %b + %cmp = icmp eq i32 addrspace(5)* %ptr0, %ptr1 %zext = zext i1 %cmp to i32 store volatile i32 %zext, i32 addrspace(1)* %out ret void @@ -73,16 +73,16 @@ define amdgpu_kernel void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* % ; CHECK-NEXT: ret void ; ; NOLDS-LABEL: @lds_promoted_alloca_icmp_null_rhs( -; NOLDS-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 4 -; NOLDS-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], [16 x i32]* [[ALLOCA]], i32 0, i32 [[A:%.*]] -; NOLDS-NEXT: [[CMP:%.*]] = icmp eq i32* [[PTR0]], null +; NOLDS-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5) +; NOLDS-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[A:%.*]] +; NOLDS-NEXT: [[CMP:%.*]] = icmp eq i32 addrspace(5)* [[PTR0]], null ; NOLDS-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32 ; NOLDS-NEXT: store volatile i32 [[ZEXT]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; NOLDS-NEXT: ret void ; - %alloca = alloca [16 x i32], align 4 - %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a - %cmp = icmp eq i32* %ptr0, null + %alloca = alloca [16 x i32], align 4, addrspace(5) + %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a + %cmp = icmp eq i32 addrspace(5)* %ptr0, null %zext = zext i1 %cmp to i32 store volatile i32 %zext, i32 addrspace(1)* %out ret void @@ -113,16 +113,16 @@ define amdgpu_kernel void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* % ; CHECK-NEXT: ret void ; ; NOLDS-LABEL: @lds_promoted_alloca_icmp_null_lhs( -; NOLDS-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 4 -; NOLDS-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], [16 x i32]* [[ALLOCA]], i32 0, i32 [[A:%.*]] -; NOLDS-NEXT: [[CMP:%.*]] = icmp eq i32* null, [[PTR0]] +; NOLDS-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5) +; NOLDS-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[A:%.*]] +; NOLDS-NEXT: [[CMP:%.*]] = icmp eq i32 addrspace(5)* null, [[PTR0]] ; NOLDS-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32 ; NOLDS-NEXT: store volatile i32 [[ZEXT]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; NOLDS-NEXT: ret void ; - %alloca = alloca [16 x i32], align 4 - %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a - %cmp = icmp eq i32* null, %ptr0 + %alloca = alloca [16 x i32], align 4, addrspace(5) + %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a + %cmp = icmp eq i32 addrspace(5)* null, %ptr0 %zext = zext i1 %cmp to i32 store volatile i32 %zext, i32 addrspace(1)* %out ret void @@ -130,32 +130,32 @@ define amdgpu_kernel void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* % define amdgpu_kernel void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; CHECK-LABEL: @lds_promoted_alloca_icmp_unknown_ptr( -; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 4 -; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], [16 x i32]* [[ALLOCA]], i32 0, i32 [[A:%.*]] -; CHECK-NEXT: [[PTR1:%.*]] = call i32* @get_unknown_pointer() -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32* [[PTR0]], [[PTR1]] +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5) +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[A:%.*]] +; CHECK-NEXT: [[PTR1:%.*]] = call i32 addrspace(5)* @get_unknown_pointer() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 addrspace(5)* [[PTR0]], [[PTR1]] ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: store volatile i32 [[ZEXT]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; NOLDS-LABEL: @lds_promoted_alloca_icmp_unknown_ptr( -; NOLDS-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 4 -; NOLDS-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], [16 x i32]* [[ALLOCA]], i32 0, i32 [[A:%.*]] -; NOLDS-NEXT: [[PTR1:%.*]] = call i32* @get_unknown_pointer() -; NOLDS-NEXT: [[CMP:%.*]] = icmp eq i32* [[PTR0]], [[PTR1]] +; NOLDS-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5) +; NOLDS-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[A:%.*]] +; NOLDS-NEXT: [[PTR1:%.*]] = call i32 addrspace(5)* @get_unknown_pointer() +; NOLDS-NEXT: [[CMP:%.*]] = icmp eq i32 addrspace(5)* [[PTR0]], [[PTR1]] ; NOLDS-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32 ; NOLDS-NEXT: store volatile i32 [[ZEXT]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; NOLDS-NEXT: ret void ; - %alloca = alloca [16 x i32], align 4 - %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a - %ptr1 = call i32* @get_unknown_pointer() - %cmp = icmp eq i32* %ptr0, %ptr1 + %alloca = alloca [16 x i32], align 4, addrspace(5) + %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a + %ptr1 = call i32 addrspace(5)* @get_unknown_pointer() + %cmp = icmp eq i32 addrspace(5)* %ptr0, %ptr1 %zext = zext i1 %cmp to i32 store volatile i32 %zext, i32 addrspace(1)* %out ret void } -declare i32* @get_unknown_pointer() #0 +declare i32 addrspace(5)* @get_unknown_pointer() #0 attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll index 31ee3184c1dd..f56f2e51766c 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll @@ -15,20 +15,20 @@ ; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4 define amdgpu_kernel void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 { entry: - %alloca = alloca [64 x i32], align 4 + %alloca = alloca [64 x i32], align 4, addrspace(5) br i1 undef, label %if, label %else if: - %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a + %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a br label %endif else: - %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %b + %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %b br label %endif endif: - %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ] - store i32 0, i32* %phi.ptr, align 4 + %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ %arrayidx1, %else ] + store i32 0, i32 addrspace(5)* %phi.ptr, align 4 ret void } @@ -36,16 +36,16 @@ endif: ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ null, %entry ] define amdgpu_kernel void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 { entry: - %alloca = alloca [64 x i32], align 4 + %alloca = alloca [64 x i32], align 4, addrspace(5) br i1 undef, label %if, label %endif if: - %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a + %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a br label %endif endif: - %phi.ptr = phi i32* [ %arrayidx0, %if ], [ null, %entry ] - store i32 0, i32* %phi.ptr, align 4 + %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ null, %entry ] + store i32 0, i32 addrspace(5)* %phi.ptr, align 4 ret void } @@ -53,16 +53,16 @@ endif: ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ null, %entry ], [ %arrayidx0, %if ] define amdgpu_kernel void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 { entry: - %alloca = alloca [64 x i32], align 4 + %alloca = alloca [64 x i32], align 4, addrspace(5) br i1 undef, label %if, label %endif if: - %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a + %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a br label %endif endif: - %phi.ptr = phi i32* [ null, %entry ], [ %arrayidx0, %if ] - store i32 0, i32* %phi.ptr, align 4 + %phi.ptr = phi i32 addrspace(5)* [ null, %entry ], [ %arrayidx0, %if ] + store i32 0, i32 addrspace(5)* %phi.ptr, align 4 ret void } @@ -75,13 +75,13 @@ endif: ; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4 define amdgpu_kernel void @one_phi_value(i32 %a) #0 { entry: - %alloca = alloca [64 x i32], align 4 - %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a + %alloca = alloca [64 x i32], align 4, addrspace(5) + %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a br label %exit exit: - %phi.ptr = phi i32* [ %arrayidx0, %entry ] - store i32 0, i32* %phi.ptr, align 4 + %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %entry ] + store i32 0, i32 addrspace(5)* %phi.ptr, align 4 ret void } @@ -89,30 +89,30 @@ exit: ; CHECK: %alloca = alloca [64 x i32], align 4 ; CHECK: if: -; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a +; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a ; CHECK: else: -; CHECK: %arrayidx1 = call i32* @get_unknown_pointer() +; CHECK: %arrayidx1 = call i32 addrspace(5)* @get_unknown_pointer() ; CHECK: endif: -; CHECK: %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ] -; CHECK: store i32 0, i32* %phi.ptr, align 4 +; CHECK: %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ %arrayidx1, %else ] +; CHECK: store i32 0, i32 addrspace(5)* %phi.ptr, align 4 define amdgpu_kernel void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 { entry: - %alloca = alloca [64 x i32], align 4 + %alloca = alloca [64 x i32], align 4, addrspace(5) br i1 undef, label %if, label %else if: - %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a + %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a br label %endif else: - %arrayidx1 = call i32* @get_unknown_pointer() + %arrayidx1 = call i32 addrspace(5)* @get_unknown_pointer() br label %endif endif: - %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ] - store i32 0, i32* %phi.ptr, align 4 + %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ %arrayidx1, %else ] + store i32 0, i32 addrspace(5)* %phi.ptr, align 4 ret void } @@ -133,12 +133,12 @@ endif: ; CHECK-LABEL: @ptr_induction_var_same_alloca( ; CHECK: %alloca = alloca [64 x i32], align 4 -; CHECK: phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ] +; CHECK: phi i32 addrspace(5)* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ] define amdgpu_kernel void @ptr_induction_var_same_alloca() #0 { entry: - %alloca = alloca [64 x i32], align 4 - %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2 - %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 48 + %alloca = alloca [64 x i32], align 4, addrspace(5) + %arrayidx = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 2 + %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 48 br label %for.body for.cond.cleanup: ; preds = %for.body @@ -146,11 +146,11 @@ for.cond.cleanup: ; preds = %for.body for.body: ; preds = %for.body, %entry %i.09 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %p.08 = phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ] - store i32 %i.09, i32* %p.08, align 4 - %incdec.ptr = getelementptr inbounds i32, i32* %p.08, i32 1 + %p.08 = phi i32 addrspace(5)* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ] + store i32 %i.09, i32 addrspace(5)* %p.08, align 4 + %incdec.ptr = getelementptr inbounds i32, i32 addrspace(5)* %p.08, i32 1 %inc = add nuw nsw i32 %i.09, 1 - %cmp = icmp eq i32* %incdec.ptr, %arrayidx1 + %cmp = icmp eq i32 addrspace(5)* %incdec.ptr, %arrayidx1 br i1 %cmp, label %for.cond.cleanup, label %for.body } @@ -170,14 +170,14 @@ for.body: ; preds = %for.body, %entry ; CHECK-LABEL: @ptr_induction_var_alloca_unknown( ; CHECK: %alloca = alloca [64 x i32], align 4 -; CHECK: %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ] -; CHECK: %cmp = icmp eq i32* %incdec.ptr, %call +; CHECK: %p.08 = phi i32 addrspace(5)* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ] +; CHECK: %cmp = icmp eq i32 addrspace(5)* %incdec.ptr, %call define amdgpu_kernel void @ptr_induction_var_alloca_unknown() #0 { entry: - %alloca = alloca [64 x i32], align 4 - %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2 - %call = tail call i32* @get_unknown_pointer() #2 - %cmp.7 = icmp eq i32* %arrayidx, %call + %alloca = alloca [64 x i32], align 4, addrspace(5) + %arrayidx = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 2 + %call = tail call i32 addrspace(5)* @get_unknown_pointer() #2 + %cmp.7 = icmp eq i32 addrspace(5)* %arrayidx, %call br i1 %cmp.7, label %for.cond.cleanup, label %for.body.preheader for.body.preheader: ; preds = %entry @@ -191,14 +191,14 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo for.body: ; preds = %for.body, %for.body.preheader %i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ] - store i32 %i.09, i32* %p.08, align 4 - %incdec.ptr = getelementptr inbounds i32, i32* %p.08, i32 1 + %p.08 = phi i32 addrspace(5)* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ] + store i32 %i.09, i32 addrspace(5)* %p.08, align 4 + %incdec.ptr = getelementptr inbounds i32, i32 addrspace(5)* %p.08, i32 1 %inc = add nuw nsw i32 %i.09, 1 - %cmp = icmp eq i32* %incdec.ptr, %call + %cmp = icmp eq i32 addrspace(5)* %incdec.ptr, %call br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body } -declare i32* @get_unknown_pointer() #0 +declare i32 addrspace(5)* @get_unknown_pointer() #0 attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-volatile.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-volatile.ll index 53d7dc9a9779..5d35f1d73910 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-volatile.ll @@ -2,26 +2,26 @@ ; CHECK-LABEL: @volatile_load( ; CHECK: alloca [4 x i32] -; CHECK: load volatile i32, i32* +; CHECK: load volatile i32, i32 addrspace(5)* define amdgpu_kernel void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: - %stack = alloca [4 x i32], align 4 + %stack = alloca [4 x i32], align 4, addrspace(5) %tmp = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32]* %stack, i32 0, i32 %tmp - %load = load volatile i32, i32* %arrayidx1 + %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %tmp + %load = load volatile i32, i32 addrspace(5)* %arrayidx1 store i32 %load, i32 addrspace(1)* %out ret void } ; CHECK-LABEL: @volatile_store( ; CHECK: alloca [4 x i32] -; CHECK: store volatile i32 %tmp, i32* +; CHECK: store volatile i32 %tmp, i32 addrspace(5)* define amdgpu_kernel void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: - %stack = alloca [4 x i32], align 4 + %stack = alloca [4 x i32], align 4, addrspace(5) %tmp = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32]* %stack, i32 0, i32 %tmp - store volatile i32 %tmp, i32* %arrayidx1 + %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %tmp + store volatile i32 %tmp, i32 addrspace(5)* %arrayidx1 ret void } @@ -32,11 +32,11 @@ entry: ; CHECK: load volatile double define amdgpu_kernel void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 { bb: - %tmp = alloca double, align 8 - store double 0.000000e+00, double* %tmp, align 8 + %tmp = alloca double, align 8, addrspace(5) + store double 0.000000e+00, double addrspace(5)* %tmp, align 8 - %tmp4 = load double, double* %tmp, align 8 - %tmp5 = load volatile double, double* %tmp, align 8 + %tmp4 = load double, double addrspace(5)* %tmp, align 8 + %tmp5 = load volatile double, double addrspace(5)* %tmp, align 8 store double %tmp4, double addrspace(1)* %arg ret void diff --git a/llvm/test/CodeGen/AMDGPU/skip-promote-alloca-vector-users.ll b/llvm/test/CodeGen/AMDGPU/skip-promote-alloca-vector-users.ll index 117795df75f1..f8a47cdaaaeb 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-promote-alloca-vector-users.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-promote-alloca-vector-users.ll @@ -4,34 +4,34 @@ ; CHECK-LABEL: @test_insertelement( ; CHECK: %alloca = alloca i16 -; CHECK-NEXT: insertelement <2 x i16*> undef, i16* %alloca, i32 0 +; CHECK-NEXT: insertelement <2 x i16 addrspace(5)*> undef, i16 addrspace(5)* %alloca, i32 0 define amdgpu_kernel void @test_insertelement() #0 { entry: - %alloca = alloca i16, align 4 - %in = insertelement <2 x i16*> undef, i16* %alloca, i32 0 - store <2 x i16*> %in, <2 x i16*>* undef, align 4 + %alloca = alloca i16, align 4, addrspace(5) + %in = insertelement <2 x i16 addrspace(5)*> undef, i16 addrspace(5)* %alloca, i32 0 + store <2 x i16 addrspace(5)*> %in, <2 x i16 addrspace(5)*>* undef, align 4 ret void } ; CHECK-LABEL: @test_insertvalue( ; CHECK: %alloca = alloca i16 -; CHECK-NEXT: insertvalue { i16* } undef, i16* %alloca, 0 +; CHECK-NEXT: insertvalue { i16 addrspace(5)* } undef, i16 addrspace(5)* %alloca, 0 define amdgpu_kernel void @test_insertvalue() #0 { entry: - %alloca = alloca i16, align 4 - %in = insertvalue { i16* } undef, i16* %alloca, 0 - store { i16* } %in, { i16* }* undef, align 4 + %alloca = alloca i16, align 4, addrspace(5) + %in = insertvalue { i16 addrspace(5)* } undef, i16 addrspace(5)* %alloca, 0 + store { i16 addrspace(5)* } %in, { i16 addrspace(5)* }* undef, align 4 ret void } ; CHECK-LABEL: @test_insertvalue_array( ; CHECK: %alloca = alloca i16 -; CHECK-NEXT: insertvalue [2 x i16*] undef, i16* %alloca, 0 +; CHECK-NEXT: insertvalue [2 x i16 addrspace(5)*] undef, i16 addrspace(5)* %alloca, 0 define amdgpu_kernel void @test_insertvalue_array() #0 { entry: - %alloca = alloca i16, align 4 - %in = insertvalue [2 x i16*] undef, i16* %alloca, 0 - store [2 x i16*] %in, [2 x i16*]* undef, align 4 + %alloca = alloca i16, align 4, addrspace(5) + %in = insertvalue [2 x i16 addrspace(5)*] undef, i16 addrspace(5)* %alloca, 0 + store [2 x i16 addrspace(5)*] %in, [2 x i16 addrspace(5)*]* undef, align 4 ret void }