diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index d5239a545bfd..178e1c0d1662 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -48,7 +48,8 @@ add_llvm_target(RISCVCodeGen
   GISel/RISCVLegalizerInfo.cpp
   GISel/RISCVRegisterBankInfo.cpp
   VentusPrintfRuntimeBinding.cpp
-
+  VentusAlwaysInlinePass.cpp
+
   LINK_COMPONENTS
   Analysis
   AsmPrinter
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index 37fb84438dc3..6b5ee46bbf00 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -13,7 +13,6 @@
 #ifndef LLVM_LIB_TARGET_RISCV_RISCV_H
 #define LLVM_LIB_TARGET_RISCV_RISCV_H
 
-
 #include "llvm/Pass.h"
 #include "MCTargetDesc/RISCVBaseInfo.h"
 #include "llvm/Target/TargetMachine.h"
@@ -84,15 +83,29 @@ InstructionSelector *createRISCVInstructionSelector(const RISCVTargetMachine &,
                                                     RISCVRegisterBankInfo &);
 
 ModulePass *createVentusPrintfRuntimeBinding();
-void initializeVentusPrintfRuntimeBindingPass(PassRegistry&);
+void initializeVentusPrintfRuntimeBindingPass(PassRegistry &);
 extern char &VentusPrintfRuntimeBindingID;
 
 struct VentusPrintfRuntimeBindingPass
     : PassInfoMixin<VentusPrintfRuntimeBindingPass> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
 };
 
-}
+/// Creates the legacy pass manager form of the Ventus always-inline pass.
+ModulePass *createVentusAlwaysInlinePass(bool GlobalOpt = true);
+void initializeVentusAlwaysInlinePass(PassRegistry &Registry);
+
+/// New pass manager form of the Ventus always-inline pass; keeps the
+/// \p GlobalOpt flag passed at construction for use by run().
+struct VentusAlwaysInlinePass : public PassInfoMixin<VentusAlwaysInlinePass> {
+  VentusAlwaysInlinePass(bool GlobalOpt = true) : GlobalOpt(GlobalOpt) {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+private:
+  bool GlobalOpt;
+};
+
+} // namespace llvm
 
 /// OpenCL uses address spaces to differentiate between
 /// various memory regions on the hardware. On the CPU
@@ -101,19 +114,19 @@ struct VentusPrintfRuntimeBindingPass
 /// a separate piece of memory that is unique from other
 /// memory locations.
namespace RISCVAS { - enum : unsigned { - // The maximum value for flat, generic, local, private, constant and region. - MAX_VENTUS_ADDRESS = 5, +enum : unsigned { + // The maximum value for flat, generic, local, private, constant and region. + MAX_VENTUS_ADDRESS = 5, - - FLAT_ADDRESS = 0, ///< Address space for flat memory. - GLOBAL_ADDRESS = 1, ///< Address space for global memory - CONSTANT_ADDRESS = 4, ///< Address space for constant memory - LOCAL_ADDRESS = 3, ///< Address space for local memory. - PRIVATE_ADDRESS = 5, ///< Address space for private memory. + FLAT_ADDRESS = 0, ///< Address space for flat memory. + GLOBAL_ADDRESS = 1, ///< Address space for global memory + CONSTANT_ADDRESS = 4, ///< Address space for constant memory + LOCAL_ADDRESS = 3, ///< Address space for local memory. + PRIVATE_ADDRESS = 5, ///< Address space for private memory. - - // Some places use this if the address space can't be determined. - UNKNOWN_ADDRESS_SPACE = ~0u, - }; + // Some places use this if the address space can't be determined. + UNKNOWN_ADDRESS_SPACE = ~0u, +}; } /// Because there are two stacks in ventus, we need to add a VGPRSpill according diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 13648559e861..5625002016f6 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -36,9 +36,11 @@ #include "llvm/Passes/PassBuilder.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/AlwaysInliner.h" +#include "llvm/Transforms/Scalar.h" #include <optional> + using namespace llvm; static cl::opt<bool> EnableRedundantCopyElimination( @@ -56,6 +58,11 @@ static cl::opt<bool> cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); +// Option to inline all early. 
+static cl::opt<bool> EarlyInlineAll("ventus-early-inline-all",
+                                    cl::desc("Inline all functions early"),
+                                    cl::init(false), cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
   RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
@@ -68,6 +75,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   initializeRISCVPreRAExpandPseudoPass(*PR);
   initializeRISCVExpandPseudoPass(*PR);
   initializeVentusPrintfRuntimeBindingPass(*PR);
+  initializeVentusAlwaysInlinePass(*PR);
 }
 
 static StringRef computeDataLayout(const Triple &TT, StringRef CPU) {
@@ -75,7 +83,7 @@ static StringRef computeDataLayout(const Triple &TT, StringRef CPU) {
   // return "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256"
   //        "-v256:256-v512:512-v1024:1024-n32:64-S128-A5-G1";
   bool IsRV32 = TT.isRISCV32();
-  if(!IsRV32)
+  if (!IsRV32)
     return "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128-A5-G1";
   assert(TT.isArch32Bit() && "only RV32 and RV64 are currently supported");
   return "e-m:e-p:32:32-i64:64-n32-S128-A5-G1";
@@ -145,12 +153,19 @@ void RISCVTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
           PM.addPass(VentusPrintfRuntimeBindingPass());
           return true;
         }
+        if (PassName == "ventus-always-inline") {
+          PM.addPass(VentusAlwaysInlinePass());
+          return true;
+        }
         return false;
       });
 
   PB.registerPipelineEarlySimplificationEPCallback(
       [this](ModulePassManager &PM, OptimizationLevel Level) {
         PM.addPass(VentusPrintfRuntimeBindingPass());
+
+        if (EarlyInlineAll)
+          PM.addPass(VentusAlwaysInlinePass());
       });
 }
 
diff --git a/llvm/lib/Target/RISCV/VentusAlwaysInlinePass.cpp b/llvm/lib/Target/RISCV/VentusAlwaysInlinePass.cpp
new file mode 100644
index 000000000000..e4c57ca2b695
--- /dev/null
+++ b/llvm/lib/Target/RISCV/VentusAlwaysInlinePass.cpp
@@ -0,0 +1,170 @@
+//===-- VentusAlwaysInlinePass.cpp - Force Function Inlining --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass marks functions for inlining in Ventus code. Specifically:
+/// 1. Functions accessing LOCAL memory (addrspace(3)) are marked as always_inline
+/// 2. Under stress-calls mode, non-kernel functions are marked as noinline
+/// 3. Otherwise, non-kernel functions are marked as always_inline
+/// 4. Function aliases are replaced with their targets and optionally removed
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVSubtarget.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define VENTUS_ALWAYS_INLINE "Ventus Inline All Functions"
+#define DEBUG_TYPE "ventus-always-inline"
+
+namespace {
+
+// When set, eligible non-kernel functions that are not required to be inlined
+// are marked noinline instead of alwaysinline.
+static cl::opt<bool> StressCalls("ventus-stress-function-calls", cl::Hidden,
+                                 cl::desc("Force all functions to be noinline"),
+                                 cl::init(false));
+
+/// Legacy pass manager wrapper around alwaysInlineImpl().
+class VentusAlwaysInline : public ModulePass {
+  bool GlobalOpt; // Erase aliases after redirecting their uses.
+
+public:
+  static char ID;
+
+  VentusAlwaysInline(bool GlobalOpt = false)
+      : ModulePass(ID), GlobalOpt(GlobalOpt) {}
+
+  bool runOnModule(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+  }
+
+  StringRef getPassName() const override { return VENTUS_ALWAYS_INLINE; }
+};
+
+} // End anonymous namespace
+
+char VentusAlwaysInline::ID = 0;
+
+/// Walk the transitive users of \p GV. Every non-kernel function containing an
+/// instruction reached this way loses its noinline attribute and is added to
+/// \p FuncsToAlwaysInline; the users of that function are then visited too.
+static void
+recursivelyVisitUsers(GlobalValue &GV,
+                      SmallPtrSetImpl<Function *> &FuncsToAlwaysInline) {
+  SmallVector<User *, 16> Stack(GV.users());
+  SmallPtrSet<User *, 8> Visited;
+
+  while (!Stack.empty()) {
+    User *U = Stack.pop_back_val();
+    if (!Visited.insert(U).second)
+      continue;
+
+    if (Instruction *I = dyn_cast<Instruction>(U)) {
+      Function *F = I->getParent()->getParent();
+      if (F->getCallingConv() != CallingConv::VENTUS_KERNEL) {
+        F->removeFnAttr(Attribute::NoInline);
+        FuncsToAlwaysInline.insert(F);
+        Stack.push_back(F);
+      }
+      continue;
+    }
+
+    // Not an instruction: keep walking through this user's own users.
+    append_range(Stack, U->users());
+  }
+}
+
+/// Shared implementation of the legacy and new pass manager passes.
+/// \returns true if the module may have been modified.
+static bool alwaysInlineImpl(Module &M, bool GlobalOpt) {
+  std::vector<GlobalAlias *> AliasesToRemove;
+
+  SmallPtrSet<Function *, 8> FuncsToAlwaysInline;
+  SmallPtrSet<Function *, 8> FuncsToNoInline;
+
+  Triple TT(M.getTargetTriple());
+
+  for (GlobalAlias &A : M.aliases()) {
+    if (Function *F = dyn_cast<Function>(A.getAliasee())) {
+      if (TT.getArch() == Triple::riscv32 &&
+          A.getLinkage() != GlobalValue::InternalLinkage)
+        continue;
+      A.replaceAllUsesWith(F);
+      AliasesToRemove.push_back(&A);
+    }
+    // FIXME: If the aliasee isn't a function, it's some kind of constant expr
+    // cast that won't be inlined through.
+  }
+
+  // Redirecting alias uses (and erasing the aliases below) changes the module
+  // even when no function attribute ends up being added.
+  bool Changed = !AliasesToRemove.empty();
+
+  if (GlobalOpt) {
+    for (GlobalAlias *A : AliasesToRemove) {
+      A->eraseFromParent();
+    }
+  }
+
+  for (GlobalVariable &GV : M.globals()) {
+    unsigned AS = GV.getAddressSpace();
+    if (AS == RISCVAS::LOCAL_ADDRESS) {
+      recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+    }
+  }
+
+  auto IncompatAttr =
+      StressCalls ? Attribute::AlwaysInline : Attribute::NoInline;
+
+  for (Function &F : M) {
+    if (!F.isDeclaration() && !F.use_empty() &&
+        !F.hasFnAttribute(IncompatAttr) &&
+        F.getCallingConv() != CallingConv::VENTUS_KERNEL) {
+      if (StressCalls) {
+        if (!FuncsToAlwaysInline.count(&F))
+          FuncsToNoInline.insert(&F);
+      } else
+        FuncsToAlwaysInline.insert(&F);
+    }
+  }
+
+  for (Function *F : FuncsToAlwaysInline)
+    F->addFnAttr(Attribute::AlwaysInline);
+
+  for (Function *F : FuncsToNoInline)
+    F->addFnAttr(Attribute::NoInline);
+
+  return Changed || !FuncsToAlwaysInline.empty() || !FuncsToNoInline.empty();
+}
+
+bool VentusAlwaysInline::runOnModule(Module &M) {
+  return alwaysInlineImpl(M, GlobalOpt);
+}
+
+INITIALIZE_PASS(VentusAlwaysInline, "ventus-always-inline",
+                VENTUS_ALWAYS_INLINE, false, false)
+
+namespace llvm {
+ModulePass *createVentusAlwaysInlinePass(bool GlobalOpt) {
+  return new VentusAlwaysInline(GlobalOpt);
+}
+
+PreservedAnalyses VentusAlwaysInlinePass::run(Module &M,
+                                              ModuleAnalysisManager &AM) {
+  // Only claim that all analyses are preserved when nothing was changed.
+  const bool Changed = alwaysInlineImpl(M, GlobalOpt);
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+} // end namespace llvm
diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/force-alwaysinline-lds-global-address.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/force-alwaysinline-lds-global-address.ll
new file mode 100644
index 000000000000..2d4ff0034b9e
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/force-alwaysinline-lds-global-address.ll
@@ -0,0 +1,89 @@
+; RUN: opt -S -mtriple=riscv32 -passes=ventus-always-inline %s | FileCheck --check-prefix=ALL %s
+; RUN: opt -S -mtriple=riscv32 -ventus-stress-function-calls -passes=ventus-always-inline %s | FileCheck --check-prefix=ALL %s
+
+@local0 = addrspace(3) global i32 undef, align 4
+@local1 = addrspace(3) global [512 x i32] undef, align 4
+@nested.local.address = addrspace(1) global ptr addrspace(3) @local0, align 4
+
+@alias.local0 = alias i32, ptr addrspace(3) @local0
+@local.cycle = addrspace(3) global i32 ptrtoint (ptr addrspace(3) @local.cycle to i32), 
align 4
+
+
+; ALL-LABEL: define i32 @load_local_simple() #0 {
+define i32 @load_local_simple() {
+  %load = load i32, ptr addrspace(3) @local0, align 4
+  ret i32 %load
+}
+
+; ALL-LABEL: define i32 @load_local_const_gep() #0 {
+define i32 @load_local_const_gep() {
+  %load = load i32, ptr addrspace(3) getelementptr inbounds ([512 x i32], ptr addrspace(3) @local1, i64 0, i64 4), align 4
+  ret i32 %load
+}
+
+; ALL-LABEL: define i32 @load_local_var_gep(i32 %idx) #0 {
+define i32 @load_local_var_gep(i32 %idx) {
+  %gep = getelementptr inbounds [512 x i32], ptr addrspace(3) @local1, i32 0, i32 %idx
+  %load = load i32, ptr addrspace(3) %gep, align 4
+  ret i32 %load
+}
+
+; ALL-LABEL: define ptr addrspace(3) @load_nested_address(i32 %idx) #0 {
+define ptr addrspace(3) @load_nested_address(i32 %idx) {
+  %load = load ptr addrspace(3), ptr addrspace(1) @nested.local.address, align 4
+  ret ptr addrspace(3) %load
+}
+
+; ALL-LABEL: define i32 @load_local_alias() #0 {
+define i32 @load_local_alias() {
+  %load = load i32, ptr addrspace(3) @alias.local0, align 4
+  ret i32 %load
+}
+
+; ALL-LABEL: define i32 @load_local_cycle() #0 {
+define i32 @load_local_cycle() {
+  %load = load i32, ptr addrspace(3) @local.cycle, align 4
+  ret i32 %load
+}
+
+; ALL-LABEL: define i1 @icmp_local_address() #0 {
+define i1 @icmp_local_address() {
+  ret i1 icmp eq (ptr addrspace(3) @local0, ptr addrspace(3) null)
+}
+
+; ALL-LABEL: define i32 @transitive_call() #0 {
+define i32 @transitive_call() {
+  %call = call i32 @load_local_simple()
+  ret i32 %call
+}
+
+; ALL-LABEL: define i32 @recursive_call_local(i32 %arg0) #0 {
+define i32 @recursive_call_local(i32 %arg0) {
+  %load = load i32, ptr addrspace(3) @local0, align 4
+  %add = add i32 %arg0, %load
+  %call = call i32 @recursive_call_local(i32 %add)
+  ret i32 %call
+}
+
+; ALL-LABEL: define i32 @load_local_simple_noinline() #0 {
+define i32 @load_local_simple_noinline() noinline {
+  %load = load i32, ptr addrspace(3) @local0, align 4
+  ret i32 %load
+}
+
+; ALL-LABEL: define i32 @recursive_call_local_noinline(i32 %arg0) #0 {
+define i32 @recursive_call_local_noinline(i32 %arg0) noinline {
+  %load = load i32, ptr addrspace(3) @local0, align 4
+  %add = add i32 %arg0, %load
+  %call = call i32 @recursive_call_local(i32 %add)
+  ret i32 %call
+}
+
+; ALL-LABEL: define ventus_kernel void @kernel_with_local_access(
+define ventus_kernel void @kernel_with_local_access(ptr addrspace(1) %out) {
+  %load = load i32, ptr addrspace(3) @local0, align 4
+  store i32 %load, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; ALL: attributes #0 = { alwaysinline }
diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/internal-alias-removal.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/internal-alias-removal.ll
new file mode 100644
index 000000000000..343ad0cf2909
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/internal-alias-removal.ll
@@ -0,0 +1,23 @@
+; RUN: opt -S -mtriple=riscv32 -passes=ventus-always-inline %s | FileCheck --implicit-check-not=@internal_alias %s
+
+; @internal_alias has internal linkage and must be erased by the pass, while
+; @public_alias is kept. Module-level aliases are printed before function
+; definitions, so @internal_alias is banned from the whole output with
+; --implicit-check-not rather than only between two function-body matches.
+
+@internal_alias = internal alias i32 (i32), ptr @original_function
+@public_alias = alias i32 (i32), ptr @original_function
+
+define i32 @original_function(i32 %x) {
+  %result = add i32 %x, 42
+  ret i32 %result
+}
+
+define i32 @call_public() {
+  %res = call i32 @public_alias(i32 7)
+  ret i32 %res
+}
+
+; CHECK: define i32 @original_function
+; CHECK-NOT: @internal_alias
+; CHECK: @public_alias
diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/ventus-always-inline.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/ventus-always-inline.ll
new file mode 100644
index 000000000000..a7c6c93272fc
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/ventus-always-inline.ll
@@ -0,0 +1,36 @@
+; RUN: opt -mtriple=riscv32 -O1 -S -inline-threshold=1 -ventus-early-inline-all %s | FileCheck %s
+
+; With -ventus-early-inline-all, calls to @callee from kernels are expected to
+; be inlined, whether made directly or through the alias @c_alias.
+
+@c_alias = dso_local alias i32 (i32), ptr @callee
+
+define dso_local i32 @callee(i32 %x) {
+entry:
+  %mul1 = mul i32 %x, %x
+  %mul2 = mul i32 %mul1, %x
+  %mul3 = mul i32 %mul1, %mul2
+  %mul4 = mul i32 %mul3, %mul2
+  %mul5 = mul i32 %mul4, %mul3
+  ret i32 %mul5
+}
+
+; CHECK-LABEL: @caller
+; CHECK: mul i32
+; CHECK-NOT: call i32
+
+define ventus_kernel void @caller(i32 %x) {
+entry:
+  %res = call i32 @callee(i32 %x)
+  store volatile i32 %res, ptr addrspace(1) undef
+  ret void
+}
+
+; CHECK-LABEL: @alias_caller(
+; CHECK-NOT: call
+define ventus_kernel void @alias_caller(i32 %x) {
+entry:
+  %res = call i32 @c_alias(i32 %x)
+  store volatile i32 %res, ptr addrspace(1) undef
+  ret void
+}