[OpenMP][FIX] Change globalization alignment to 16

This patch changes the default aligntment from 8 to 16, and encodes this
information in the `__kmpc_alloc_shared` runtime call to communicate it
to the HeapToStack pass. The previous alignment of 8 was not sufficient
for the maximum size of primitive types on 64-bit systems, and needs to
be increaesd. This reduces the amount of space availible in the data
sharing stack, so this implementation will need to be improved later to
include the alignment requirements in the allocation call, and use it
properly in the data sharing stack in the runtime.

Depends on D115888

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D115971
This commit is contained in:
Joseph Huber 2021-12-17 17:00:13 -05:00
parent 38fc89623b
commit 7cdaa5a94e
14 changed files with 541 additions and 532 deletions

View File

@ -1402,10 +1402,14 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
// Allocate space for the variable to be globalized
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
llvm::Instruction *VoidPtr =
llvm::CallBase *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
// FIXME: We should use the variables actual alignment as an argument.
VoidPtr->addRetAttr(llvm::Attribute::get(
CGM.getLLVMContext(), llvm::Attribute::Alignment,
CGM.getContext().getTargetInfo().getNewAlign() / 8));
// Cast the void pointer and get the address of the globalized variable.
llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
@ -1438,10 +1442,13 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
// Allocate space for this VLA object to be globalized.
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
llvm::Instruction *VoidPtr =
llvm::CallBase *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
VoidPtr->addRetAttr(
llvm::Attribute::get(CGM.getLLVMContext(), llvm::Attribute::Alignment,
CGM.getContext().getTargetInfo().getNewAlign()));
I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
std::pair<llvm::Value *, llvm::Value *>(

View File

@ -58,8 +58,8 @@ int maini1() {
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[B]]) #[[ATTR4:[0-9]+]]
// CHECK1-NEXT: [[CALL1:%.*]] = call i32 @_Z3barv() #[[ATTR4]]
// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[B]]) #[[ATTR6:[0-9]+]]
// CHECK1-NEXT: [[CALL1:%.*]] = call i32 @_Z3barv() #[[ATTR6]]
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]]
// CHECK1-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
// CHECK1-NEXT: ret void
@ -78,9 +78,9 @@ int maini1() {
// CHECK1-LABEL: define {{[^@]+}}@_Z3barv
// CHECK1-SAME: () #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[A:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[A:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A]] to i32*
// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[A_ON_STACK]]) #[[ATTR4]]
// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[A_ON_STACK]]) #[[ATTR6]]
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[A]], i64 4)
// CHECK1-NEXT: ret i32 [[CALL]]
//

View File

@ -397,9 +397,9 @@ void test_ds(){
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
// CHECK-NEXT: [[A:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK-NEXT: [[A:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A]] to i32*
// CHECK-NEXT: [[B:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK-NEXT: [[B:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK-NEXT: [[B_ON_STACK:%.*]] = bitcast i8* [[B]] to i32*
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK-NEXT: store i32 10, i32* [[A_ON_STACK]], align 4

View File

@ -3046,7 +3046,7 @@ int main(int argc, char **argv) {
// CHECK4-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32*
// CHECK4-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8
// CHECK4-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i64 40)
// CHECK4-NEXT: [[C1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 40)
// CHECK4-NEXT: [[C_ON_STACK:%.*]] = bitcast i8* [[C1]] to [10 x i32]*
// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV]], align 4
// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR_]], align 4
@ -3377,7 +3377,7 @@ int main(int argc, char **argv) {
// CHECK5-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
// CHECK5-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4
// CHECK5-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
// CHECK5-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i32 40)
// CHECK5-NEXT: [[C1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 40)
// CHECK5-NEXT: [[C_ON_STACK:%.*]] = bitcast i8* [[C1]] to [10 x i32]*
// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR_]], align 4
@ -3700,7 +3700,7 @@ int main(int argc, char **argv) {
// CHECK6-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
// CHECK6-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4
// CHECK6-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
// CHECK6-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i32 40)
// CHECK6-NEXT: [[C1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 40)
// CHECK6-NEXT: [[C_ON_STACK:%.*]] = bitcast i8* [[C1]] to [10 x i32]*
// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR_]], align 4

View File

@ -1633,7 +1633,7 @@ int bar(int n){
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1: user_code.entry:
// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 4
// CHECK1-NEXT: [[A1:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[A1:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A1]] to i32*
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[A_ON_STACK]], align 4
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -1905,7 +1905,7 @@ int bar(int n){
// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK2: user_code.entry:
// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK2-NEXT: [[A1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[A1:%.*]] = call align 4 i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A1]] to i32*
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[A_ON_STACK]], align 4
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])

View File

@ -469,7 +469,7 @@ int bar(int n){
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
// CHECK-NEXT: [[D:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK-NEXT: [[D:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D]] to i32*
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4

View File

@ -418,7 +418,7 @@ void unreachable_call() {
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca double*, align 8
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
// CHECK1-NEXT: [[F:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[F:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32*
// CHECK1-NEXT: store i32 [[F1]], i32* [[F_ON_STACK]], align 4
// CHECK1-NEXT: store double* [[A]], double** [[A_ADDR]], align 8
@ -802,7 +802,7 @@ void unreachable_call() {
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4
// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
// CHECK2-NEXT: [[F:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[F:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32*
// CHECK2-NEXT: store i32 [[F1]], i32* [[F_ON_STACK]], align 4
// CHECK2-NEXT: store double* [[A]], double** [[A_ADDR]], align 4
@ -1185,7 +1185,7 @@ void unreachable_call() {
// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4
// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
// CHECK3-NEXT: [[F:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK3-NEXT: [[F:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK3-NEXT: [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32*
// CHECK3-NEXT: store i32 [[F1]], i32* [[F_ON_STACK]], align 4
// CHECK3-NEXT: store double* [[A]], double** [[A_ADDR]], align 4

View File

@ -607,7 +607,7 @@ int bar(int n){
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: [[I:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[I:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[I_ON_STACK:%.*]] = bitcast i8* [[I]] to i32*
// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
@ -729,7 +729,7 @@ int bar(int n){
// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT: [[I:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[I:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[I_ON_STACK:%.*]] = bitcast i8* [[I]] to i32*
// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
@ -851,7 +851,7 @@ int bar(int n){
// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK3-NEXT: [[I:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK3-NEXT: [[I:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK3-NEXT: [[I_ON_STACK:%.*]] = bitcast i8* [[I]] to i32*
// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4

View File

@ -18534,7 +18534,7 @@ int bar(int n){
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
// CHECK1-NEXT: [[L2:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[L2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L2]] to i32*
// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 4
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
@ -20128,7 +20128,7 @@ int bar(int n){
// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
// CHECK2-NEXT: [[L2:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK2-NEXT: [[L2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK2-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L2]] to i32*
// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
@ -21711,7 +21711,7 @@ int bar(int n){
// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
// CHECK3-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK3-NEXT: [[L1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK3-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32*
// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
@ -23245,7 +23245,7 @@ int bar(int n){
// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
// CHECK4-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK4-NEXT: [[L1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK4-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32*
// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4

View File

@ -9447,7 +9447,7 @@ int bar(int n){
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
// CHECK1-NEXT: [[L2:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[L2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L2]] to i32*
// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 4
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
@ -10525,7 +10525,7 @@ int bar(int n){
// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
// CHECK2-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
// CHECK2-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[L1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32*
// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
@ -11563,7 +11563,7 @@ int bar(int n){
// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
// CHECK3-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK3-NEXT: [[L1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK3-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32*
// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4

View File

@ -903,7 +903,7 @@ int main (int argc, char **argv) {
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1: user_code.entry:
// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 4
// CHECK1-NEXT: [[ARGC1:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[ARGC1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC1]] to i32*
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[ARGC_ON_STACK]], align 4
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -943,7 +943,7 @@ int main (int argc, char **argv) {
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1: user_code.entry:
// CHECK1-NEXT: [[TMP1:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8
// CHECK1-NEXT: [[ARGC1:%.*]] = call i8* @__kmpc_alloc_shared(i64 8)
// CHECK1-NEXT: [[ARGC1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 8)
// CHECK1-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC1]] to i8***
// CHECK1-NEXT: store i8** [[TMP1]], i8*** [[ARGC_ON_STACK]], align 8
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -983,7 +983,7 @@ int main (int argc, char **argv) {
// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK2: user_code.entry:
// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
// CHECK2-NEXT: [[ARGC1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[ARGC1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC1]] to i32*
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[ARGC_ON_STACK]], align 4
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -1023,7 +1023,7 @@ int main (int argc, char **argv) {
// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK2: user_code.entry:
// CHECK2-NEXT: [[TMP1:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4
// CHECK2-NEXT: [[ARGC1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[ARGC1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC1]] to i8***
// CHECK2-NEXT: store i8** [[TMP1]], i8*** [[ARGC_ON_STACK]], align 4
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -1070,7 +1070,7 @@ int main (int argc, char **argv) {
// CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK3: user_code.entry:
// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV2]], align 4
// CHECK3-NEXT: [[ARGC3:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK3-NEXT: [[ARGC3:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK3-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC3]] to i32*
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[ARGC_ON_STACK]], align 4
// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -1116,7 +1116,7 @@ int main (int argc, char **argv) {
// CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK3: user_code.entry:
// CHECK3-NEXT: [[TMP1:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8
// CHECK3-NEXT: [[ARGC2:%.*]] = call i8* @__kmpc_alloc_shared(i64 8)
// CHECK3-NEXT: [[ARGC2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 8)
// CHECK3-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC2]] to i8***
// CHECK3-NEXT: store i8** [[TMP1]], i8*** [[ARGC_ON_STACK]], align 8
// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -1160,7 +1160,7 @@ int main (int argc, char **argv) {
// CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK4: user_code.entry:
// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
// CHECK4-NEXT: [[ARGC1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK4-NEXT: [[ARGC1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK4-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC1]] to i32*
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[ARGC_ON_STACK]], align 4
// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -1204,7 +1204,7 @@ int main (int argc, char **argv) {
// CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK4: user_code.entry:
// CHECK4-NEXT: [[TMP1:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4
// CHECK4-NEXT: [[ARGC1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK4-NEXT: [[ARGC1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK4-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC1]] to i8***
// CHECK4-NEXT: store i8** [[TMP1]], i8*** [[ARGC_ON_STACK]], align 4
// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])

View File

@ -4219,7 +4219,7 @@ int bar(int n){
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1: user_code.entry:
// CHECK1-NEXT: [[TMP1:%.*]] = load double, double* [[CONV]], align 8
// CHECK1-NEXT: [[E1:%.*]] = call i8* @__kmpc_alloc_shared(i64 8)
// CHECK1-NEXT: [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 8)
// CHECK1-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double*
// CHECK1-NEXT: store double [[TMP1]], double* [[E_ON_STACK]], align 8
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -4244,7 +4244,7 @@ int bar(int n){
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: store double* [[E]], double** [[E_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 8
// CHECK1-NEXT: [[E1:%.*]] = call i8* @__kmpc_alloc_shared(i64 8)
// CHECK1-NEXT: [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 8)
// CHECK1-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double*
// CHECK1-NEXT: store double 0.000000e+00, double* [[E_ON_STACK]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8
@ -4521,10 +4521,10 @@ int bar(int n){
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1: user_code.entry:
// CHECK1-NEXT: [[TMP1:%.*]] = load i8, i8* [[CONV]], align 1
// CHECK1-NEXT: [[C2:%.*]] = call i8* @__kmpc_alloc_shared(i64 1)
// CHECK1-NEXT: [[C2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 1)
// CHECK1-NEXT: store i8 [[TMP1]], i8* [[C2]], align 1
// CHECK1-NEXT: [[TMP2:%.*]] = load float, float* [[CONV1]], align 4
// CHECK1-NEXT: [[D3:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[D3:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D3]] to float*
// CHECK1-NEXT: store float [[TMP2]], float* [[D_ON_STACK]], align 4
// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -4553,8 +4553,8 @@ int bar(int n){
// CHECK1-NEXT: store float* [[D]], float** [[D_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 8
// CHECK1-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i64 1)
// CHECK1-NEXT: [[D2:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[C1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 1)
// CHECK1-NEXT: [[D2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float*
// CHECK1-NEXT: store i8 0, i8* [[C1]], align 1
// CHECK1-NEXT: store float 1.000000e+00, float* [[D_ON_STACK]], align 4
@ -5563,7 +5563,7 @@ int bar(int n){
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
// CHECK2-NEXT: [[E1:%.*]] = call i8* @__kmpc_alloc_shared(i32 8)
// CHECK2-NEXT: [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 8)
// CHECK2-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double*
// CHECK2-NEXT: store double 0.000000e+00, double* [[E_ON_STACK]], align 8
// CHECK2-NEXT: [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8
@ -5840,10 +5840,10 @@ int bar(int n){
// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK2: user_code.entry:
// CHECK2-NEXT: [[TMP1:%.*]] = load i8, i8* [[CONV]], align 1
// CHECK2-NEXT: [[C2:%.*]] = call i8* @__kmpc_alloc_shared(i32 1)
// CHECK2-NEXT: [[C2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 1)
// CHECK2-NEXT: store i8 [[TMP1]], i8* [[C2]], align 1
// CHECK2-NEXT: [[TMP2:%.*]] = load float, float* [[CONV1]], align 4
// CHECK2-NEXT: [[D3:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[D3:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D3]] to float*
// CHECK2-NEXT: store float [[TMP2]], float* [[D_ON_STACK]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -5872,8 +5872,8 @@ int bar(int n){
// CHECK2-NEXT: store float* [[D]], float** [[D_ADDR]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
// CHECK2-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
// CHECK2-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i32 1)
// CHECK2-NEXT: [[D2:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[C1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 1)
// CHECK2-NEXT: [[D2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float*
// CHECK2-NEXT: store i8 0, i8* [[C1]], align 1
// CHECK2-NEXT: store float 1.000000e+00, float* [[D_ON_STACK]], align 4
@ -6881,7 +6881,7 @@ int bar(int n){
// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK3-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
// CHECK3-NEXT: [[E1:%.*]] = call i8* @__kmpc_alloc_shared(i32 8)
// CHECK3-NEXT: [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 8)
// CHECK3-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double*
// CHECK3-NEXT: store double 0.000000e+00, double* [[E_ON_STACK]], align 8
// CHECK3-NEXT: [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8
@ -7158,10 +7158,10 @@ int bar(int n){
// CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK3: user_code.entry:
// CHECK3-NEXT: [[TMP1:%.*]] = load i8, i8* [[CONV]], align 1
// CHECK3-NEXT: [[C2:%.*]] = call i8* @__kmpc_alloc_shared(i32 1)
// CHECK3-NEXT: [[C2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 1)
// CHECK3-NEXT: store i8 [[TMP1]], i8* [[C2]], align 1
// CHECK3-NEXT: [[TMP2:%.*]] = load float, float* [[CONV1]], align 4
// CHECK3-NEXT: [[D3:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK3-NEXT: [[D3:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK3-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D3]] to float*
// CHECK3-NEXT: store float [[TMP2]], float* [[D_ON_STACK]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -7190,8 +7190,8 @@ int bar(int n){
// CHECK3-NEXT: store float* [[D]], float** [[D_ADDR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
// CHECK3-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
// CHECK3-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i32 1)
// CHECK3-NEXT: [[D2:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK3-NEXT: [[C1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 1)
// CHECK3-NEXT: [[D2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK3-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float*
// CHECK3-NEXT: store i8 0, i8* [[C1]], align 1
// CHECK3-NEXT: store float 1.000000e+00, float* [[D_ON_STACK]], align 4

View File

@ -26,7 +26,9 @@ using namespace _OMP;
///{
/// Add worst-case padding so that future allocations are properly aligned.
constexpr const uint32_t Alignment = 8;
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to be
/// passed in as an argument and the stack rewritten to support it.
constexpr const uint32_t Alignment = 16;
/// External symbol to access dynamic shared memory.
extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));