[X86] AMX programming model.

This patch implements the AMX programming model that was discussed on llvm-dev
 (http://lists.llvm.org/pipermail/llvm-dev/2020-August/144302.html).
 Thanks to Hal for the good suggestion on the register allocation. Support for the fast register allocator is not in this patch yet.
 This patch implements 7 components; a short usage sketch of the C interface follows the list.

1. The C interface for the end user.
2. The AMX intrinsics in LLVM IR.
3. Transform load/store <256 x i32> to AMX intrinsics, or split the
   type into two <128 x i32> halves.
4. Lowering from AMX intrinsics to AMX pseudo instructions.
5. Insert pseudo ldtilecfg and build the def-use chain between ldtilecfg and the
   AMX instructions.
6. Register allocation for tile registers.
7. Morph AMX pseudo instructions into real AMX instructions.
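
For reference, here is a minimal usage sketch of the C interface (component 1),
modeled on the integration test added in this patch; the buffer names, shapes,
and the -mamx-tile/-mamx-int8 compile options are illustrative assumptions:

  #include <immintrin.h>

  static char bufA[1024], bufB[1024], bufC[1024];

  // c += a x b for one tile; the shape is carried in the __tile1024i struct.
  void tile_dp_example(unsigned short row, unsigned short col) {
    __tile1024i a = {row, 8};
    __tile1024i b = {8, col};
    __tile1024i c = {row, col};
    __tile_loadd(&a, bufA, 32);   // 32 is the stride in bytes between rows
    __tile_loadd(&b, bufB, 32);
    __tile_loadd(&c, bufC, 32);
    __tile_dpbsud(&c, a, b);      // dot-product accumulate into c
    __tile_stored(bufC, 32, c);
  }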

Change-Id: I935e1080916ffcb72af54c2c83faa8b2e97d5cb0

Differential Revision: https://reviews.llvm.org/D87981
This commit is contained in:
Luo, Yuanke 2020-09-06 10:17:22 +08:00
parent 09f717b929
commit f80b29878b
39 changed files with 1856 additions and 44 deletions

View File

@ -100,6 +100,10 @@ TARGET_BUILTIN(__builtin_ia32_stui, "v", "n", "uintr")
TARGET_BUILTIN(__builtin_ia32_testui, "Uc", "n", "uintr")
TARGET_BUILTIN(__builtin_ia32_senduipi, "vUWi", "n", "uintr")
// AMX internal builtin
TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
TARGET_BUILTIN(__builtin_ia32_tilestored64_internal, "vUsUsv*zV256i", "n", "amx-tile")
// AMX
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")

View File

@ -15,8 +15,8 @@
#define __AMXINTRIN_H
#ifdef __x86_64__
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
#define __DEFAULT_FN_ATTRS_TILE \
__attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
/// Load tile configuration from a 64-byte memory location specified by
/// "mem_addr". The tile configuration includes the tile type palette, the
@ -31,9 +31,8 @@
///
/// \param __config
/// A pointer to the 512-bit configuration
static __inline__ void __DEFAULT_FN_ATTRS
_tile_loadconfig(const void *__config)
{
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_loadconfig(const void *__config) {
__builtin_ia32_tile_loadconfig(__config);
}
@ -48,9 +47,8 @@ _tile_loadconfig(const void *__config)
///
/// \param __config
/// A pointer to the 512-bit configuration
static __inline__ void __DEFAULT_FN_ATTRS
_tile_storeconfig(void *__config)
{
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_storeconfig(void *__config) {
__builtin_ia32_tile_storeconfig(__config);
}
@ -60,9 +58,7 @@ _tile_storeconfig(void *__config)
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
static __inline__ void __DEFAULT_FN_ATTRS
_tile_release(void)
{
static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
__builtin_ia32_tilerelease();
}
@ -80,8 +76,9 @@ _tile_release(void)
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
#define _tile_loadd(dst, base, stride) \
__builtin_ia32_tileloadd64((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride))
#define _tile_loadd(dst, base, stride) \
__builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
(__SIZE_TYPE__)(stride))
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
@ -99,8 +96,9 @@ _tile_release(void)
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
#define _tile_stream_loadd(dst, base, stride) \
__builtin_ia32_tileloaddt164((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride))
#define _tile_stream_loadd(dst, base, stride) \
__builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
(__SIZE_TYPE__)(stride))
/// Store the tile specified by "src" to memory specified by "base" address and
/// "stride" using the tile configuration previously configured via
@ -116,7 +114,7 @@ _tile_release(void)
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be stored in memory.
#define _tile_stored(dst, base, stride) \
#define _tile_stored(dst, base, stride) \
__builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
/// Zero the tile specified by "tdest".
@ -145,7 +143,8 @@ _tile_release(void)
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbssd(dst, src0, src1) __builtin_ia32_tdpbssd((dst), (src0), (src1))
#define _tile_dpbssd(dst, src0, src1) \
__builtin_ia32_tdpbssd((dst), (src0), (src1))
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
@ -163,7 +162,8 @@ _tile_release(void)
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbsud(dst, src0, src1) __builtin_ia32_tdpbsud((dst), (src0), (src1))
#define _tile_dpbsud(dst, src0, src1) \
__builtin_ia32_tdpbsud((dst), (src0), (src1))
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
@ -181,7 +181,8 @@ _tile_release(void)
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbusd(dst, src0, src1) __builtin_ia32_tdpbusd((dst), (src0), (src1))
#define _tile_dpbusd(dst, src0, src1) \
__builtin_ia32_tdpbusd((dst), (src0), (src1))
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
@ -199,7 +200,8 @@ _tile_release(void)
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbuud(dst, src0, src1) __builtin_ia32_tdpbuud((dst), (src0), (src1))
#define _tile_dpbuud(dst, src0, src1) \
__builtin_ia32_tdpbuud((dst), (src0), (src1))
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
@ -216,10 +218,56 @@ _tile_release(void)
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbf16ps(dst, src0, src1) \
#define _tile_dpbf16ps(dst, src0, src1) \
__builtin_ia32_tdpbf16ps((dst), (src0), (src1))
#undef __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS_INT8 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
__SIZE_TYPE__ stride) {
return __builtin_ia32_tileloadd64_internal(m, n, base,
(__SIZE_TYPE__)(stride));
}
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
}
static __inline__ void __DEFAULT_FN_ATTRS_INT8
_tile_stored_internal(unsigned short m, unsigned short n, void *base,
__SIZE_TYPE__ stride, _tile1024i tile) {
return __builtin_ia32_tilestored64_internal(m, n, base,
(__SIZE_TYPE__)(stride), tile);
}
typedef struct __tile1024i_str {
const unsigned short row;
const unsigned short col;
_tile1024i tile;
} __tile1024i;
__DEFAULT_FN_ATTRS_INT8
static void __tile_loadd(__tile1024i *dst, const void *base,
__SIZE_TYPE__ stride) {
dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
}
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbsud(__tile1024i *dst, __tile1024i src1,
__tile1024i src2) {
dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile,
src1.tile, src2.tile);
}
__DEFAULT_FN_ATTRS_INT8
static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
_tile_stored_internal(src.row, src.col, base, stride, src.tile);
}
#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */

View File

@ -0,0 +1,51 @@
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx512f -target-feature +amx-int8 \
// RUN: -target-feature +amx-bf16 -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK
#include <immintrin.h>
char buf[1024];
#define STRIDE 32
char buf2[1024];
// This is example code and an integration test.
void test_api(int cond, short row, short col) {
//CHECK-LABEL: @test_api
//CHECK: call <256 x i32> @llvm.x86.tileloadd64.internal
//CHECK: call <256 x i32> @llvm.x86.tdpbssd.internal
//CHECK: call void @llvm.x86.tilestored64.internal
__tile1024i a = {row, 8};
__tile1024i b = {8, col};
__tile1024i c = {row, col};
if (cond) {
__tile_loadd(&a, buf, STRIDE);
__tile_loadd(&b, buf, STRIDE);
__tile_loadd(&c, buf, STRIDE);
} else {
__tile_loadd(&a, buf2, STRIDE);
__tile_loadd(&b, buf2, STRIDE);
__tile_loadd(&c, buf2, STRIDE);
}
__tile_dpbsud(&c, a, b);
__tile_stored(buf, STRIDE, c);
}
void test_tile_loadd(short row, short col) {
//CHECK-LABEL: @test_tile_loadd
//CHECK: call <256 x i32> @llvm.x86.tileloadd64.internal
__tile1024i a = {row, col};
__tile_loadd(&a, buf, STRIDE);
}
void test_tile_dpbsud(__tile1024i a, __tile1024i b, __tile1024i c) {
//CHECK-LABEL: @test_tile_dpbsud
//CHECK: call <256 x i32> @llvm.x86.tdpbssd.internal
__tile_dpbsud(&c, a, b);
}
void test_tile_stored(__tile1024i c) {
//CHECK-LABEL: @test_tile_stored
//CHECK: call void @llvm.x86.tilestored64.internal
__tile_stored(buf, STRIDE, c);
}

View File

@ -104,6 +104,9 @@ public:
void verify(LiveVirtRegBitSet& VisitedVRegs);
#endif
// Get any virtual register that is assigned to this physical unit
LiveInterval *getOneVReg() const;
/// Query interferences between a single live virtual register and a live
/// interval union.
class Query {

View File

@ -153,6 +153,8 @@ public:
/// Directly access the live interval unions per regunit.
/// This returns an array indexed by the regunit number.
LiveIntervalUnion *getLiveUnions() { return &Matrix[0]; }
Register getOneVReg(unsigned PhysReg) const;
};
} // end namespace llvm

View File

@ -485,6 +485,10 @@ namespace llvm {
/// The pass fixups statepoint machine instruction to replace usage of
/// caller saved registers with stack slots.
extern char &FixupStatepointCallerSavedID;
/// The pass transforms load/store <256 x i32> into AMX load/store intrinsics
/// or splits the data into two <128 x i32> halves.
FunctionPass *createX86LowerAMXTypePass();
} // End llvm namespace
#endif

View File

@ -0,0 +1,107 @@
//===- llvm/CodeGen/TileShapeInfo.h -----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Shape utility for AMX.
/// AMX hardware requires the shape of a tile data register to be configured
/// before use. The 2D shape consists of a row and a column. In the AMX
/// intrinsics interface the shape is passed as the 1st and 2nd parameters, and
/// they are lowered to the 1st and 2nd machine operands of the AMX pseudo
/// instructions. The ShapeT class facilitates tile configuration and register
/// allocation; the row and column are machine operands of the AMX pseudo
/// instructions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CODEGEN_TILESHAPEINFO_H
#define LLVM_CODEGEN_TILESHAPEINFO_H
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Register.h"
#include <utility>
using namespace llvm;
namespace llvm {
class ShapeT {
public:
ShapeT(MachineOperand *Row, MachineOperand *Col,
const MachineRegisterInfo *MRI = nullptr)
: Row(Row), Col(Col) {
if (MRI)
deduceImm(MRI);
}
ShapeT()
: Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
ColImm(InvalidImmShape) {}
bool operator==(const ShapeT &Shape) {
MachineOperand *R = Shape.Row;
MachineOperand *C = Shape.Col;
if (!R || !C)
return false;
if (!Row || !Col)
return false;
if (Row->getReg() == R->getReg() && Col->getReg() == C->getReg())
return true;
if ((RowImm != InvalidImmShape) && (ColImm != InvalidImmShape))
return RowImm == Shape.getRowImm() && ColImm == Shape.getColImm();
return false;
}
bool operator!=(const ShapeT &Shape) { return !(*this == Shape); }
ShapeT &operator=(const ShapeT &RHS) {
Row = RHS.Row;
Col = RHS.Col;
RowImm = RHS.RowImm;
ColImm = RHS.ColImm;
return *this;
}
MachineOperand *getRow() const { return Row; }
MachineOperand *getCol() const { return Col; }
int64_t getRowImm() const { return RowImm; }
int64_t getColImm() const { return ColImm; }
bool isValid() { return (Row != nullptr) && (Col != nullptr); }
void deduceImm(const MachineRegisterInfo *MRI) {
// All defs must be the same value; otherwise the MIs are invalid.
// Find the immediate.
// TODO: copy propagation.
auto GetImm = [&](Register Reg) {
int64_t Imm = InvalidImmShape;
for (const MachineOperand &DefMO : MRI->def_operands(Reg)) {
const auto *MI = DefMO.getParent();
if (MI->isMoveImmediate()) {
Imm = MI->getOperand(1).getImm();
break;
}
}
return Imm;
};
RowImm = GetImm(Row->getReg());
ColImm = GetImm(Col->getReg());
}
private:
static constexpr int64_t InvalidImmShape = -1;
MachineOperand *Row;
MachineOperand *Col;
int64_t RowImm;
int64_t ColImm;
};
} // namespace llvm
#endif
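
As a sketch of how a client of ShapeT builds and compares shapes (a hypothetical
helper, not part of the patch; operands 1 and 2 of the PTILELOADDV/PTDPBSSDV
pseudos carry the row and column, as the later passes rely on):

  #include "llvm/CodeGen/TileShapeInfo.h"

  // Hypothetical helper: true when two AMX pseudo instructions have the same
  // shape, either because the row/col registers match or because the deduced
  // immediates match.
  static bool haveSameShape(llvm::MachineInstr &A, llvm::MachineInstr &B,
                            const llvm::MachineRegisterInfo *MRI) {
    llvm::ShapeT ShapeA(&A.getOperand(1), &A.getOperand(2), MRI);
    llvm::ShapeT ShapeB(&B.getOperand(1), &B.getOperand(2), MRI);
    return ShapeA == ShapeB;
  }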

View File

@ -19,6 +19,7 @@
#include "llvm/ADT/IndexedMap.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TileShapeInfo.h"
#include "llvm/Pass.h"
#include <cassert>
@ -60,6 +61,10 @@ class TargetInstrInfo;
/// mapping.
IndexedMap<unsigned, VirtReg2IndexFunctor> Virt2SplitMap;
/// Virt2ShapeMap - For X86 AMX, maps a virtual tile register to the shape
/// information bound to it.
DenseMap<unsigned, ShapeT> Virt2ShapeMap;
/// createSpillSlot - Allocate a spill slot for RC from MFI.
unsigned createSpillSlot(const TargetRegisterClass *RC);
@ -107,6 +112,21 @@ class TargetInstrInfo;
/// the specified physical register
void assignVirt2Phys(Register virtReg, MCPhysReg physReg);
bool isShapeMapEmpty() const { return Virt2ShapeMap.empty(); }
bool hasShape(Register virtReg) const {
return getShape(virtReg).isValid();
}
ShapeT getShape(Register virtReg) const {
assert(virtReg.isVirtual());
return Virt2ShapeMap.lookup(virtReg);
}
void assignVirt2Shape(Register virtReg, ShapeT shape) {
Virt2ShapeMap[virtReg.id()] = shape;
}
/// clears the specified virtual register's physical
/// register mapping
void clearVirt(Register virtReg) {
@ -133,6 +153,9 @@ class TargetInstrInfo;
/// records virtReg is a split live interval from SReg.
void setIsSplitFromReg(Register virtReg, unsigned SReg) {
Virt2SplitMap[virtReg.id()] = SReg;
if (hasShape(SReg)) {
Virt2ShapeMap[virtReg.id()] = getShape(SReg);
}
}
/// returns the live interval virtReg is split from.

View File

@ -292,6 +292,7 @@ def llvm_v8i32_ty : LLVMType<v8i32>; // 8 x i32
def llvm_v16i32_ty : LLVMType<v16i32>; // 16 x i32
def llvm_v32i32_ty : LLVMType<v32i32>; // 32 x i32
def llvm_v64i32_ty : LLVMType<v64i32>; // 64 x i32
def llvm_v256i32_ty : LLVMType<v256i32>; // 256 x i32
def llvm_v1i64_ty : LLVMType<v1i64>; // 1 x i64
def llvm_v2i64_ty : LLVMType<v2i64>; // 2 x i64

View File

@ -5056,4 +5056,20 @@ let TargetPrefix = "x86" in {
Intrinsic<[llvm_i8_ty], [], []>;
def int_x86_senduipi : GCCBuiltin<"__builtin_ia32_senduipi">,
Intrinsic<[], [llvm_i64_ty], []>;
// AMX - internal intrinsics
def int_x86_tileloadd64_internal :
GCCBuiltin<"__builtin_ia32_tileloadd64_internal">,
Intrinsic<[llvm_v256i32_ty],
[llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
[]>;
def int_x86_tdpbssd_internal :
GCCBuiltin<"__builtin_ia32_tdpbssd_internal">,
Intrinsic<[llvm_v256i32_ty],
[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
llvm_v256i32_ty, llvm_v256i32_ty,
llvm_v256i32_ty], []>;
def int_x86_tilestored64_internal :
GCCBuiltin<"__builtin_ia32_tilestored64_internal">,
Intrinsic<[], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty,
llvm_i64_ty, llvm_v256i32_ty], []>;
}

View File

@ -1558,4 +1558,6 @@ void HoistSpillHelper::LRE_DidCloneVirtReg(Register New, Register Old) {
VRM.assignVirt2StackSlot(New, VRM.getStackSlot(Old));
else
llvm_unreachable("VReg should be assigned either physreg or stackslot");
if (VRM.hasShape(Old))
VRM.assignVirt2Shape(New, VRM.getShape(Old));
}

View File

@ -99,6 +99,16 @@ void LiveIntervalUnion::verify(LiveVirtRegBitSet& VisitedVRegs) {
}
#endif //!NDEBUG
LiveInterval *LiveIntervalUnion::getOneVReg() const {
if (empty())
return nullptr;
for (LiveSegments::const_iterator SI = Segments.begin(); SI.valid(); ++SI) {
// Return the first valid live interval.
return SI.value();
}
return nullptr;
}
// Scan the vector of interfering virtual registers in this union. Assume it's
// quite small.
bool LiveIntervalUnion::Query::isSeenInterference(LiveInterval *VirtReg) const {

View File

@ -221,3 +221,13 @@ bool LiveRegMatrix::checkInterference(SlotIndex Start, SlotIndex End,
}
return false;
}
Register LiveRegMatrix::getOneVReg(unsigned PhysReg) const {
LiveInterval *VRegInterval = nullptr;
for (MCRegUnitIterator Unit(PhysReg, TRI); Unit.isValid(); ++Unit) {
if ((VRegInterval = Matrix[*Unit].getOneVReg()))
return VRegInterval->reg();
}
return MCRegister::NoRegister;
}

View File

@ -68,6 +68,7 @@ bool VirtRegMap::runOnMachineFunction(MachineFunction &mf) {
Virt2PhysMap.clear();
Virt2StackSlotMap.clear();
Virt2SplitMap.clear();
Virt2ShapeMap.clear();
grow();
return false;

View File

@ -32,6 +32,9 @@ set(sources
X86CmovConversion.cpp
X86DomainReassignment.cpp
X86DiscriminateMemOps.cpp
X86LowerAMXType.cpp
X86TileConfig.cpp
X86PreTileConfig.cpp
X86ExpandPseudo.cpp
X86FastISel.cpp
X86FixupBWInsts.cpp

View File

@ -76,6 +76,10 @@ FunctionPass *createX86FlagsCopyLoweringPass();
/// Return a pass that expands WinAlloca pseudo-instructions.
FunctionPass *createX86WinAllocaExpander();
FunctionPass *createX86TileConfigPass();
FunctionPass *createX86PreTileConfigPass();
/// Return a pass that inserts int3 at the end of the function if it ends with a
/// CALL instruction. The pass does the same for each funclet as well. This
/// ensures that the open interval of function start and end PCs contains all
@ -162,6 +166,9 @@ void initializeX86OptimizeLEAPassPass(PassRegistry &);
void initializeX86PartialReductionPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
void initializeX86PreTileConfigPass(PassRegistry &);
void initializeX86TileConfigPass(PassRegistry &);
void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
namespace X86AS {
enum : unsigned {

View File

@ -461,6 +461,39 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case TargetOpcode::ICALL_BRANCH_FUNNEL:
ExpandICallBranchFunnel(&MBB, MBBI);
return true;
case X86::PLDTILECFG: {
MI.RemoveOperand(0);
MI.setDesc(TII->get(X86::LDTILECFG));
return true;
}
case X86::PSTTILECFG: {
MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg
MI.setDesc(TII->get(X86::STTILECFG));
return true;
}
case X86::PTILELOADDV: {
MI.RemoveOperand(8); // Remove $tmmcfg
for (unsigned i = 2; i > 0; --i)
MI.RemoveOperand(i);
MI.setDesc(TII->get(X86::TILELOADD));
return true;
}
case X86::PTDPBSSDV: {
MI.RemoveOperand(7); // Remove $tmmcfg
MI.untieRegOperand(4);
for (unsigned i = 3; i > 0; --i)
MI.RemoveOperand(i);
MI.setDesc(TII->get(X86::TDPBSSD));
MI.tieOperands(0, 1);
return true;
}
case X86::PTILESTOREDV: {
MI.RemoveOperand(8); // Remove $tmmcfg
for (int i = 1; i >= 0; --i)
MI.RemoveOperand(i);
MI.setDesc(TII->get(X86::TILESTORED));
return true;
}
}
llvm_unreachable("Previous switch has a fallthrough?");
}

View File

@ -2091,6 +2091,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true);
}
}
// Emit tilerelease for AMX kernel.
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (!MRI.reg_nodbg_empty(X86::TMMCFG))
BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
}
StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,

View File

@ -4572,6 +4572,49 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, Res);
return;
}
case Intrinsic::x86_tileloadd64_internal: {
if (!Subtarget->hasAMXTILE())
break;
unsigned Opc = X86::PTILELOADDV;
// _tile_loadd_internal(row, col, buf, STRIDE)
SDValue Base = Node->getOperand(4);
SDValue Scale = getI8Imm(1, dl);
SDValue Index = Node->getOperand(5);
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
SDValue Chain = Node->getOperand(0);
MachineSDNode *CNode;
SDValue Ops[] = {Node->getOperand(2),
Node->getOperand(3),
Base,
Scale,
Index,
Disp,
Segment,
CFG,
Chain};
CNode = CurDAG->getMachineNode(Opc, dl, {MVT::v256i32, MVT::Other}, Ops);
ReplaceNode(Node, CNode);
return;
}
case Intrinsic::x86_tdpbssd_internal: {
if (!Subtarget->hasAMXTILE())
break;
unsigned Opc = X86::PTDPBSSDV;
SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
SDValue Ops[] = {Node->getOperand(2),
Node->getOperand(3),
Node->getOperand(4),
Node->getOperand(5),
Node->getOperand(6),
Node->getOperand(7),
CFG};
MachineSDNode *CNode =
CurDAG->getMachineNode(Opc, dl, {MVT::v256i32, MVT::Other}, Ops);
ReplaceNode(Node, CNode);
return;
}
}
break;
}
@ -4629,6 +4672,31 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
}
case Intrinsic::x86_tilestored64_internal: {
unsigned Opc = X86::PTILESTOREDV;
// _tile_stored_internal(row, col, buf, STRIDE, c)
SDValue Base = Node->getOperand(4);
SDValue Scale = getI8Imm(1, dl);
SDValue Index = Node->getOperand(5);
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
SDValue Chain = Node->getOperand(0);
MachineSDNode *CNode;
SDValue Ops[] = {Node->getOperand(2),
Node->getOperand(3),
Base,
Scale,
Index,
Disp,
Segment,
Node->getOperand(6),
CFG,
Chain};
CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
ReplaceNode(Node, CNode);
return;
}
case Intrinsic::x86_tileloadd64:
case Intrinsic::x86_tileloaddt164:
case Intrinsic::x86_tilestored64: {

View File

@ -1897,6 +1897,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
}
if (Subtarget.hasAMXTILE()) {
addRegisterClass(MVT::v256i32, &X86::TILERegClass);
}
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@ -5341,6 +5345,12 @@ bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
// width.
if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
return false;
// Don't merge into an x86 AMX tile, as we only map MVT::v256i32
// to the x86 AMX tile for AMX intrinsics.
if (MemVT == MVT::v256i32)
return false;
return true;
}

View File

@ -16,17 +16,21 @@
let Predicates = [HasAMXTILE, In64BitMode] in {
let SchedRW = [WriteSystem] in {
let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
let hasSideEffects = 1,
Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
"ldtilecfg\t$src",
[(int_x86_ldtilecfg addr:$src)]>, VEX, T8PS;
let hasSideEffects = 1 in
def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
"sttilecfg\t$src",
[(int_x86_sttilecfg addr:$src)]>, VEX, T8PD;
let mayLoad = 1 in
def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
(ins sibmem:$src),
"tileloadd\t{$src, $dst|$dst, $src}", []>,
VEX, T8XD;
let mayLoad = 1 in
def TILELOADDT1 : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
(ins sibmem:$src),
"tileloaddt1\t{$src, $dst|$dst, $src}", []>,
@ -34,6 +38,7 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
def TILERELEASE : I<0x49, MRM_C0, (outs), (ins),
"tilerelease", [(int_x86_tilerelease)]>, VEX, T8PS;
let mayStore = 1 in
def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs),
(ins sibmem:$dst, TILE:$src),
"tilestored\t{$src, $dst|$dst, $src}", []>,
@ -42,6 +47,22 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
"tilezero\t$dst", []>,
VEX, T8XD;
// Pseudo instructions for RA.
let hasSideEffects = 1, mayLoad = 1,
Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>;
let hasSideEffects = 1, mayStore = 1 in
def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>;
def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
GR16:$src2,
opaquemem:$src3,
TILECFG:$cfg), []>;
def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
GR16:$src2, opaquemem:$src3,
TILE:$src4, TILECFG:$cfg), []>;
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
// To be translated to the actual instructions in X86ISelLowering.cpp
@ -76,6 +97,12 @@ let Predicates = [HasAMXINT8, In64BitMode] in {
VEX_4V, T8PS;
}
// Pseudo instruction for RA.
let Constraints = "$src4 = $dst" in
def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
GR16:$src2, GR16:$src3, TILE:$src4,
TILE:$src5, TILE:$src6, TILECFG:$cfg), []>;
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
// To be translated to the actual instructions in X86ISelLowering.cpp

View File

@ -3796,13 +3796,31 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
const MachineFunction &MF = *MBB.getParent();
assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
"Stack slot too small for store");
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
(Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
RI.canRealignStack(MF);
unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
.addReg(SrcReg, getKillRegState(isKill));
if (RC->getID() == X86::TILERegClassID) {
unsigned Opc = X86::TILESTORED;
// tilestored %tmm, (%sp, %idx)
MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
MachineInstr *NewMI =
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
.addReg(SrcReg, getKillRegState(isKill));
MachineOperand &MO = NewMI->getOperand(2);
MO.setReg(VirtReg);
MO.setIsKill(true);
} else if (RC->getID() == X86::TILECFGRegClassID) {
unsigned Opc = X86::PSTTILECFG;
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
.addReg(SrcReg, getKillRegState(isKill));
} else {
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
(Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
RI.canRealignStack(MF);
unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
.addReg(SrcReg, getKillRegState(isKill));
}
}
void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@ -3810,13 +3828,32 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
Register DestReg, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
(Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
RI.canRealignStack(MF);
unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx);
if (RC->getID() == X86::TILERegClassID) {
unsigned Opc = X86::TILELOADD;
// tileloadd (%sp, %idx), %tmm
MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
MachineInstr *NewMI =
BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
NewMI = addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
FrameIdx);
MachineOperand &MO = NewMI->getOperand(3);
MO.setReg(VirtReg);
MO.setIsKill(true);
} else if (RC->getID() == X86::TILECFGRegClassID) {
unsigned Opc = X86::PLDTILECFG;
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
FrameIdx);
} else {
const MachineFunction &MF = *MBB.getParent();
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
(Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
RI.canRealignStack(MF);
unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
FrameIdx);
}
}
bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,

View File

@ -0,0 +1,294 @@
//===-- X86LowerAMXType.cpp - Lower AMX type for load/store ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Pass to transform <256 x i32> load/store.
/// <256 x i32> is mapped to an AMX tile register on X86, and the AMX
/// instruction set only provides simple operations on tile registers; basic
/// elementwise operations are not supported by AMX. Since we define the AMX
/// tile as the vector type <256 x i32> and only AMX intrinsics can operate on
/// the type, we need to transform load/store <256 x i32> instructions into AMX
/// loads/stores. In addition, we split a <256 x i32> into two <128 x i32>
/// halves if the vector is not used or defined by AMX intrinsics, so that
/// instruction selection can lower it to a vector size the hardware supports.
//
//===----------------------------------------------------------------------===//
//
#include "X86.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
using namespace llvm;
#define DEBUG_TYPE "lower-amx-type"
namespace {
class X86LowerAMXType {
Function &Func;
const DataLayout &DL;
DenseSet<Instruction *> LDSet;
DenseSet<Instruction *> STSet;
DenseMap<Value *, std::pair<LoadInst *, LoadInst *>> LoadMap;
public:
X86LowerAMXType(Function &F) : Func(F), DL(F.getParent()->getDataLayout()) {}
bool visit();
bool visitLD();
bool visitST();
void splitST(Instruction *Inst);
void splitLD(Instruction *Inst);
};
// Split a v256i32 load/store into two v128i32 so that ISel can
// lower it to a proper vector size.
void X86LowerAMXType::splitST(Instruction *Inst) {
StoreInst *ST = dyn_cast<StoreInst>(Inst);
IRBuilder<> Builder(ST);
LLVMContext &Ctx = Builder.getContext();
Type *Ty = ST->getValueOperand()->getType();
EVT VT = EVT::getEVT(Ty);
EVT HalfVT = VT.getHalfNumVectorElementsVT(Ctx);
Type *HalfTy = HalfVT.getTypeForEVT(Ctx);
LoadInst *Lo, *Hi;
std::tie(Lo, Hi) = LoadMap[ST->getValueOperand()];
Value *Ptr = ST->getPointerOperand();
PointerType *HalfPtrTy = HalfTy->getPointerTo(ST->getPointerAddressSpace());
Value *HalfPtr = Builder.CreateBitCast(Ptr, HalfPtrTy);
// The HW requires the AMX tile to be 64-byte aligned, but the front end
// generates code with the vector alignment, which is the vector size.
uint64_t HalfTySize = HalfTy->getPrimitiveSizeInBits().getFixedSize() / 8;
Align Alignment = std::min(Lo->getAlign(), Align(HalfTySize));
Builder.CreateAlignedStore(Lo, HalfPtr, Alignment, ST->isVolatile());
HalfPtr = Builder.CreateGEP(HalfTy, HalfPtr, Builder.getInt32(1));
Builder.CreateAlignedStore(Hi, HalfPtr, Alignment, ST->isVolatile());
}
bool X86LowerAMXType::visitST() {
if (STSet.empty())
return false;
for (auto *Inst : STSet) {
Value *Row, *Col;
const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst->getOperand(0));
if (!II)
Row = Col = nullptr;
else {
switch (II->getIntrinsicID()) {
default:
Row = Col = nullptr;
break;
case Intrinsic::x86_tileloadd64_internal:
case Intrinsic::x86_tdpbssd_internal: {
Row = II->getArgOperand(0);
Col = II->getArgOperand(1);
break;
}
}
}
if (!Row) {
splitST(Inst);
continue;
}
IRBuilder<> Builder(Inst);
LLVMContext &Ctx = Builder.getContext();
// Use the maximum column as the stride. It must match the load stride.
Value *Stride = Builder.getInt64(64);
Value *I8Ptr =
Builder.CreateBitCast(Inst->getOperand(1), Type::getInt8PtrTy(Ctx));
std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride,
Inst->getOperand(0)};
Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
}
return true;
}
void X86LowerAMXType::splitLD(Instruction *Inst) {
LoadInst *LD = dyn_cast<LoadInst>(Inst);
IRBuilder<> Builder(LD);
LLVMContext &Ctx = Builder.getContext();
Type *Ty = LD->getType();
EVT VT = EVT::getEVT(Ty);
EVT HalfVT = VT.getHalfNumVectorElementsVT(Ctx);
Type *HalfTy = HalfVT.getTypeForEVT(Ctx);
Value *Ptr = LD->getPointerOperand();
PointerType *HalfPtrTy = HalfTy->getPointerTo(LD->getPointerAddressSpace());
Value *HalfPtr = Builder.CreateBitCast(Ptr, HalfPtrTy);
// The HW requires the AMX tile to be 64-byte aligned, but the front end
// generates code with the vector alignment, which is the vector size.
uint64_t HalfTySize = HalfTy->getPrimitiveSizeInBits().getFixedSize() / 8;
Align Alignment = std::min(LD->getAlign(), Align(HalfTySize));
auto *Lo =
Builder.CreateAlignedLoad(HalfTy, HalfPtr, Alignment, LD->isVolatile());
HalfPtr = Builder.CreateGEP(HalfTy, HalfPtr, Builder.getInt32(1));
auto *Hi =
Builder.CreateAlignedLoad(HalfTy, HalfPtr, Alignment, LD->isVolatile());
LoadMap[Inst] = std::make_pair(Lo, Hi);
}
bool X86LowerAMXType::visitLD() {
if (LDSet.empty())
return false;
for (auto &Inst : LDSet) {
int Count = 0;
Value *NewInst = nullptr;
// The users should be either all AMX intrinsics or all LLVM instructions;
// a value used by both AMX intrinsics and LLVM instructions is not supported.
for (auto I = Inst->use_begin(), E = Inst->use_end(); I != E;) {
Use &U = *I++;
const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U.getUser());
if (!II) {
Count++;
continue;
}
if (NewInst)
continue;
Value *Row, *Col;
switch (II->getIntrinsicID()) {
default:
report_fatal_error("Non-AMX intrinsic use tile type.");
break;
case Intrinsic::x86_tdpbssd_internal: {
unsigned OpNo = U.getOperandNo();
switch (OpNo) {
case 3:
Row = II->getArgOperand(0);
Col = II->getArgOperand(1);
break;
case 4:
Row = II->getArgOperand(0);
Col = II->getArgOperand(2);
break;
case 5:
Row = II->getArgOperand(2);
Col = II->getArgOperand(1);
break;
}
break;
}
case Intrinsic::x86_tilestored64_internal: {
Row = II->getArgOperand(0);
Col = II->getArgOperand(1);
break;
}
}
assert(Count == 0 && "Can NOT mix amx intrinsic and LLVM instruction");
// FIXME: The shape def should be ahead of load.
IRBuilder<> Builder(Inst);
LLVMContext &Ctx = Builder.getContext();
// Use the maximum column as the stride.
Value *Stride = Builder.getInt64(64);
Value *I8Ptr =
Builder.CreateBitCast(Inst->getOperand(0), Type::getInt8PtrTy(Ctx));
std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
NewInst = Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal,
None, Args);
Inst->replaceAllUsesWith(NewInst);
}
if (!NewInst)
splitLD(Inst);
}
return true;
}
bool X86LowerAMXType::visit() {
bool C;
auto IsAMXType = [](FixedVectorType *VTy) {
if (!VTy)
return false;
if (!VTy->getScalarType()->isIntegerTy(32))
return false;
if (VTy->getNumElements() != 256)
return false;
return true;
};
for (BasicBlock &BB : Func) {
for (Instruction &Inst : BB) {
LoadInst *LD = dyn_cast<LoadInst>(&Inst);
// Check load instruction.
// %3 = load <256 x i32>, <256 x i32>* %1, align 64
if (LD) {
FixedVectorType *VTy = dyn_cast<FixedVectorType>(Inst.getType());
if (!IsAMXType(VTy))
continue;
LDSet.insert(&Inst);
continue;
}
// Check store instruction.
// store <256 x i32> %3, <256 x i32>* %2, align 64
StoreInst *ST = dyn_cast<StoreInst>(&Inst);
if (!ST)
continue;
FixedVectorType *VTy =
dyn_cast<FixedVectorType>(ST->getOperand(0)->getType());
if (!IsAMXType(VTy))
continue;
STSet.insert(&Inst);
}
}
C = visitLD() | visitST();
for (auto *Inst : STSet)
Inst->eraseFromParent();
for (auto *Inst : LDSet)
Inst->eraseFromParent();
return C;
}
} // anonymous namespace
namespace {
class X86LowerAMXTypeLegacyPass : public FunctionPass {
public:
static char ID;
X86LowerAMXTypeLegacyPass() : FunctionPass(ID) {
initializeX86LowerAMXTypeLegacyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override {
X86LowerAMXType LAT(F);
bool C = LAT.visit();
return C;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
}
};
} // anonymous namespace
static const char PassName[] = "Lower AMX type for load/store";
char X86LowerAMXTypeLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
false)
INITIALIZE_PASS_END(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
false)
FunctionPass *llvm::createX86LowerAMXTypePass() {
return new X86LowerAMXTypeLegacyPass();
}

View File

@ -0,0 +1,263 @@
//===-- X86PreTileConfig.cpp - Tile Register Configure --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Pass to pre-configure the shape of AMX registers.
/// AMX registers need to be configured before use. The shape of an AMX
/// register is encoded in the 1st and 2nd machine operands of the AMX pseudo
/// instructions. The pldtilecfg instruction configures the tile registers; it
/// should dominate all AMX instructions. pldtilecfg produces a virtual cfg
/// register, and that cfg register is used by all AMX instructions.
/// This pass finds the common dominator of all AMX instructions and inserts
/// the pldtilecfg instruction there. In addition, the cfg register that
/// pldtilecfg produces is inserted as the last operand of each AMX
/// instruction. We use this scheme to model the def-use relationship between
/// the AMX config instruction and the other AMX instructions. Below is an
/// example.
///
/// ----B1----
/// / \
/// / \
/// B2 B3
/// %1:tile = PTILELOADDV %2:tile = PTILELOADDV
///
/// is transformed to
///
/// B1
/// %25:tilecfg = PLDTILECFG
/// / \
/// / \
/// %1:tile = PTILELOADDV %25 %2:tile = PTILELOADDV %25
//
//===----------------------------------------------------------------------===//
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TileShapeInfo.h"
#include "llvm/InitializePasses.h"
using namespace llvm;
#define DEBUG_TYPE "tile-pre-config"
namespace {
class X86PreTileConfig : public MachineFunctionPass {
// context
MachineFunction *MF = nullptr;
const X86Subtarget *ST = nullptr;
const TargetRegisterInfo *TRI;
const TargetInstrInfo *TII;
MachineDominatorTree *DomTree = nullptr;
MachineRegisterInfo *MRI = nullptr;
MachineInstr *getTileConfigPoint();
public:
X86PreTileConfig() : MachineFunctionPass(ID) {}
/// Return the pass name.
StringRef getPassName() const override {
return "Tile Register Pre-configure";
}
/// X86PreTileConfig analysis usage.
void getAnalysisUsage(AnalysisUsage &AU) const override;
/// Perform tile register pre-configuration.
bool runOnMachineFunction(MachineFunction &mf) override;
static char ID;
};
} // end anonymous namespace
char X86PreTileConfig::ID = 0;
INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig",
"Tile Register Configure", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
"Tile Register Configure", false, false)
void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
AU.addRequired<MachineDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
const TargetInstrInfo *TII,
MachineRegisterInfo *MRI,
const X86Subtarget *ST) {
auto *MBB = MI->getParent();
// FIXME: AMX should assume AVX512 enabled.
if (ST->hasAVX512()) {
// Zero stack slot.
Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm)
.addReg(Zmm, RegState::Undef)
.addReg(Zmm, RegState::Undef);
addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)),
FrameIdx)
.addReg(Zmm);
}
// Build the pseudo ldtilecfg.
Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass);
addFrameReference(
BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx);
return VReg;
}
static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
default:
llvm_unreachable("Unexpected machine instruction on tile");
case X86::PTILELOADDV:
case X86::PTDPBSSDV:
MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1));
MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2));
ShapeT Shape(&MO1, &MO2, MRI);
return Shape;
}
}
MachineInstr *X86PreTileConfig::getTileConfigPoint() {
DenseMap<Register, ShapeT> PhysShapeInfo;
MachineBasicBlock *MBB = nullptr;
DenseSet<const MachineInstr *> MIs;
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
Register VirtReg = Register::index2VirtReg(i);
if (MRI->reg_nodbg_empty(VirtReg))
continue;
const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
if (RC.getID() != X86::TILERegClassID)
continue;
// Find the common dominator of all MIs that define tile registers.
for (const MachineOperand &MO : MRI->def_operands(VirtReg)) {
if (MO.isUndef())
continue;
const auto *MI = MO.getParent();
// PHI or IMPLICIT_DEF instruction.
// There must be an input tile before a PHI instruction.
if (MI->isTransient())
continue;
if (!MBB)
MBB = const_cast<MachineBasicBlock *>(MI->getParent());
MBB = DomTree->findNearestCommonDominator(
MBB, const_cast<MachineBasicBlock *>(MI->getParent()));
// Collect the instructions that define shape.
ShapeT Shape = getShape(*MI, MRI);
std::array<MachineOperand *, 2> ShapeMOs = {Shape.getRow(),
Shape.getCol()};
for (auto *ShapeMO : ShapeMOs) {
Register ShapeReg = ShapeMO->getReg();
for (const MachineOperand &MO : MRI->def_operands(ShapeReg)) {
const auto *ShapeMI = MO.getParent();
MIs.insert(ShapeMI);
}
}
}
}
if (!MBB)
return nullptr;
// This pass runs before PHI elimination, so the function
// is still in SSA form.
assert(MRI->isSSA() && "Not SSA form in pre-tile config");
// Shape def should dominate tile config MBB.
// def s s1 s2
// / \ \ /
// / \ \ /
// conf s3=phi(s1,s2)
// |
// c
//
for (const auto *MI : MIs) {
const MachineBasicBlock *ShapeMBB = MI->getParent();
if (DomTree->dominates(ShapeMBB, MBB))
continue;
if (MI->isMoveImmediate())
continue;
report_fatal_error(MF->getName() + ": Failed to config tile register, "
"please define the shape earlier");
}
// ldtilecfg should be inserted after the MI that defines the shape.
MachineBasicBlock::reverse_instr_iterator I, E;
for (I = MBB->instr_rbegin(), E = MBB->instr_rend(); I != E; ++I) {
auto *MI = &*I;
if (MIs.count(MI) && (!MI->isMoveImmediate()))
break;
}
MachineBasicBlock::iterator MII;
if (I == E)
MII = MBB->getFirstNonPHI();
else {
MII = MachineBasicBlock::iterator(&*I);
MII++;
}
return &*MII;
}
static void addTileCFGUse(MachineFunction &MF, Register CFG) {
for (MachineBasicBlock &MBB : MF) {
// Traverse the basic block.
for (MachineInstr &MI : MBB) {
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
default:
break;
case X86::PTILELOADDV:
case X86::PTILESTOREDV:
case X86::PTDPBSSDV:
unsigned NumOperands = MI.getNumOperands();
MI.RemoveOperand(NumOperands - 1);
MI.addOperand(MF, MachineOperand::CreateReg(CFG, false));
break;
}
}
}
}
bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
MRI = &mf.getRegInfo();
ST = &mf.getSubtarget<X86Subtarget>();
TRI = ST->getRegisterInfo();
TII = mf.getSubtarget().getInstrInfo();
DomTree = &getAnalysis<MachineDominatorTree>();
MachineInstr *MI = getTileConfigPoint();
if (!MI)
return false;
unsigned Size = ST->getTileConfigSize();
Align Alignment = ST->getTileConfigAlignment();
int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
Register CFG = buildConfigMI(MI, SS, TII, MRI, ST);
addTileCFGUse(mf, CFG);
return true;
}
FunctionPass *llvm::createX86PreTileConfigPass() {
return new X86PreTileConfig();
}

View File

@ -19,6 +19,7 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@ -856,3 +857,78 @@ X86RegisterInfo::getPtrSizedStackRegister(const MachineFunction &MF) const {
StackReg = getX86SubSuperRegister(StackReg, 32);
return StackReg;
}
static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
const MachineRegisterInfo *MRI) {
if (VRM->hasShape(VirtReg))
return VRM->getShape(VirtReg);
const MachineOperand &Def = *MRI->def_begin(VirtReg);
MachineInstr *MI = const_cast<MachineInstr *>(Def.getParent());
unsigned OpCode = MI->getOpcode();
switch (OpCode) {
default:
llvm_unreachable("Unexpected machine instruction on tile register!");
break;
// We only collect the tile shape that is defined.
case X86::PTILELOADDV:
case X86::PTDPBSSDV:
MachineOperand &MO1 = MI->getOperand(1);
MachineOperand &MO2 = MI->getOperand(2);
ShapeT Shape(&MO1, &MO2, MRI);
VRM->assignVirt2Shape(VirtReg, Shape);
return Shape;
}
}
bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
ArrayRef<MCPhysReg> Order,
SmallVectorImpl<MCPhysReg> &Hints,
const MachineFunction &MF,
const VirtRegMap *VRM,
const LiveRegMatrix *Matrix) const {
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
VirtReg, Order, Hints, MF, VRM, Matrix);
if (RC.getID() != X86::TILERegClassID)
return BaseImplRetVal;
ShapeT VirtShape = getTileShape(VirtReg, const_cast<VirtRegMap *>(VRM), MRI);
auto AddHint = [&](MCPhysReg PhysReg) {
Register VReg = Matrix->getOneVReg(PhysReg);
if (VReg == MCRegister::NoRegister) { // Not allocated yet
Hints.push_back(PhysReg);
return;
}
ShapeT PhysShape = getTileShape(VReg, const_cast<VirtRegMap *>(VRM), MRI);
if (PhysShape == VirtShape)
Hints.push_back(PhysReg);
};
SmallSet<MCPhysReg, 4> CopyHints;
CopyHints.insert(Hints.begin(), Hints.end());
Hints.clear();
for (auto Hint : CopyHints) {
if (RC.contains(Hint) && !MRI->isReserved(Hint))
AddHint(Hint);
}
for (MCPhysReg PhysReg : Order) {
if (!CopyHints.count(PhysReg) && RC.contains(PhysReg) &&
!MRI->isReserved(PhysReg))
AddHint(PhysReg);
}
#define DEBUG_TYPE "tile-hint"
LLVM_DEBUG({
dbgs() << "Hints for virtual register " << format_hex(VirtReg, 8) << "\n";
for (auto Hint : Hints) {
dbgs() << "tmm" << Hint << ",";
}
dbgs() << "\n";
});
#undef DEBUG_TYPE
return true;
}

View File

@ -144,6 +144,11 @@ public:
Register getFramePtr() const { return FramePtr; }
// FIXME: Move to FrameInfo
unsigned getSlotSize() const { return SlotSize; }
bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
SmallVectorImpl<MCPhysReg> &Hints,
const MachineFunction &MF, const VirtRegMap *VRM,
const LiveRegMatrix *Matrix) const override;
};
} // End llvm namespace

View File

@ -265,6 +265,9 @@ let SubRegIndices = [sub_ymm] in {
}
}
// Tile config registers.
def TMMCFG: X86Reg<"tmmcfg", 0>;
// Tile "registers".
def TMM0: X86Reg<"tmm0", 0>;
def TMM1: X86Reg<"tmm1", 1>;
@ -633,6 +636,11 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
// Tiles
let isAllocatable = 0 in
def TILE : RegisterClass<"X86", [untyped], 0,
let CopyCost = -1 in // Don't allow copying of tile registers
def TILE : RegisterClass<"X86", [v256i32], 8192,
(sequence "TMM%u", 0, 7)> {let Size = 8192;}
def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> {
let CopyCost = -1; // Don't allow copying of tile config registers.
let isAllocatable = 1;
let Size = 512;
}

View File

@ -472,6 +472,8 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// entry to the function and which must be maintained by every function.
Align stackAlignment = Align(4);
Align TileConfigAlignment = Align(4);
/// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
///
// FIXME: this is a known good value for Yonah. How about others?
@ -555,6 +557,9 @@ public:
return &getInstrInfo()->getRegisterInfo();
}
unsigned getTileConfigSize() const { return 64; }
Align getTileConfigAlignment() const { return TileConfigAlignment; }
/// Returns the minimum alignment known to hold of the
/// stack frame on entry to the function and which must be maintained by every
/// function for this subtarget.

View File

@ -62,6 +62,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
RegisterTargetMachine<X86TargetMachine> Y(getTheX86_64Target());
PassRegistry &PR = *PassRegistry::getPassRegistry();
initializeX86LowerAMXTypeLegacyPassPass(PR);
initializeGlobalISel(PR);
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
@ -71,6 +72,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
initializeX86FixupSetCCPassPass(PR);
initializeX86CallFrameOptimizationPass(PR);
initializeX86CmovConverterPassPass(PR);
initializeX86TileConfigPass(PR);
initializeX86ExpandPseudoPass(PR);
initializeX86ExecutionDomainFixPass(PR);
initializeX86DomainReassignmentPass(PR);
@ -379,6 +381,7 @@ public:
void addPreEmitPass() override;
void addPreEmitPass2() override;
void addPreSched2() override;
bool addPreRewrite() override;
std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};
@ -407,6 +410,7 @@ TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
void X86PassConfig::addIRPasses() {
addPass(createAtomicExpandPass());
addPass(createX86LowerAMXTypePass());
TargetPassConfig::addIRPasses();
@ -492,7 +496,12 @@ void X86PassConfig::addPreRegAlloc() {
addPass(createX86SpeculativeLoadHardeningPass());
addPass(createX86FlagsCopyLoweringPass());
addPass(createX86WinAllocaExpander());
if (getOptLevel() != CodeGenOpt::None) {
addPass(createX86PreTileConfigPass());
}
}
void X86PassConfig::addMachineSSAOptimization() {
addPass(createX86DomainReassignmentPass());
TargetPassConfig::addMachineSSAOptimization();
@ -565,6 +574,11 @@ void X86PassConfig::addPreEmitPass2() {
addPass(createX86LoadValueInjectionRetHardeningPass());
}
bool X86PassConfig::addPreRewrite() {
addPass(createX86TileConfigPass());
return true;
}
std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const {
return getStandardCSEConfigForOpt(TM->getOptLevel());
}

View File

@ -0,0 +1,248 @@
//===-- X86TileConfig.cpp - Tile Register Configure -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Pass to configure the shape of AMX physical registers.
/// AMX registers need to be configured before use. The X86PreTileConfig pass
/// inserts the pldtilecfg instruction, but at that point we don't yet know
/// the shape of each physical tile register because register allocation has
/// not been done. This pass runs after the register allocation pass. It
/// collects the shape information of each physical tile register and stores
/// the shapes into the stack slot that is allocated for loading the
/// configuration into the tile config register.
//
//===----------------------------------------------------------------------===//
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TileShapeInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"
using namespace llvm;
#define DEBUG_TYPE "tile-config"
namespace {
class X86TileConfig : public MachineFunctionPass {
// context
MachineFunction *MF = nullptr;
const X86Subtarget *ST = nullptr;
const TargetRegisterInfo *TRI;
const TargetInstrInfo *TII;
MachineDominatorTree *DomTree = nullptr;
MachineRegisterInfo *MRI = nullptr;
VirtRegMap *VRM = nullptr;
LiveIntervals *LIS = nullptr;
MachineInstr *getTileConfigPoint();
void tileConfig();
public:
X86TileConfig() : MachineFunctionPass(ID) {}
/// Return the pass name.
StringRef getPassName() const override { return "Tile Register Configure"; }
/// X86TileConfig analysis usage.
void getAnalysisUsage(AnalysisUsage &AU) const override;
/// Perform tile register configuration.
bool runOnMachineFunction(MachineFunction &mf) override;
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoPHIs);
}
static char ID;
};
} // end anonymous namespace
char X86TileConfig::ID = 0;
INITIALIZE_PASS_BEGIN(X86TileConfig, "tileconfig", "Tile Register Configure",
false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_END(X86TileConfig, "tileconfig", "Tile Register Configure",
false, false)
void X86TileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<MachineDominatorTree>();
AU.addRequired<LiveIntervals>();
AU.addPreserved<SlotIndexes>();
AU.addRequired<VirtRegMap>();
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
static unsigned getTilePhysRegIndex(Register PhysReg) {
assert((PhysReg >= X86::TMM0 && PhysReg <= X86::TMM7) &&
"Tile register number is invalid");
return (PhysReg - X86::TMM0);
}
static MachineInstr *
storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
Register SrcReg, unsigned BitSize, int FrameIdx, int Offset,
const TargetInstrInfo *TII, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) {
unsigned SubIdx = (BitSize == 8) ? X86::sub_8bit : X86::sub_16bit;
unsigned Opc = (BitSize == 8) ? X86::MOV8mr : X86::MOV16mr;
if (BitSize == TRI->getRegSizeInBits(*RC))
SubIdx = 0;
MachineInstr *NewMI =
addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)), FrameIdx,
Offset)
.addReg(SrcReg, 0, SubIdx);
return NewMI;
}
static MachineInstr *storeImmToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
int64_t Imm, unsigned BitSize,
int FrameIdx, int Offset,
const TargetInstrInfo *TII) {
unsigned Opc = (BitSize == 8) ? X86::MOV8mi : X86::MOV16mi;
return addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)),
FrameIdx, Offset)
.addImm(Imm);
}
MachineInstr *X86TileConfig::getTileConfigPoint() {
for (MachineBasicBlock &MBB : *MF) {
// Traverse the basic block.
for (MachineInstr &MI : MBB)
// Refer to X86PreTileConfig.cpp.
// We only support one tile config for now.
if (MI.getOpcode() == X86::PLDTILECFG)
return &MI;
}
return nullptr;
}
void X86TileConfig::tileConfig() {
MachineInstr *MI = getTileConfigPoint();
if (!MI)
return;
MachineBasicBlock *MBB = MI->getParent();
int SS = MI->getOperand(1).getIndex();
BitVector PhysRegs(TRI->getNumRegs());
// Fill in the palette first.
auto *NewMI = storeImmToStackSlot(*MBB, *MI, 1, 8, SS, 0, TII);
LIS->InsertMachineInstrInMaps(*NewMI);
// Fill in the shape of each tile physical register.
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
Register VirtReg = Register::index2VirtReg(i);
if (MRI->reg_nodbg_empty(VirtReg))
continue;
const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
if (RC.getID() != X86::TILERegClassID)
continue;
Register PhysReg = VRM->getPhys(VirtReg);
if (PhysRegs.test(PhysReg))
continue;
PhysRegs.set(PhysReg);
ShapeT Shape = VRM->getShape(VirtReg);
Register RowReg = Shape.getRow()->getReg();
Register ColReg = Shape.getCol()->getReg();
// Here is the data format for the tile config.
// 0 palette
// 1 start_row
// 2-15 reserved, must be zero
// 16-17 tile0.colsb Tile 0 bytes per row.
// 18-19 tile1.colsb Tile 1 bytes per row.
// 20-21 tile2.colsb Tile 2 bytes per row.
// ... (sequence continues)
// 30-31 tile7.colsb Tile 7 bytes per row.
// 32-47 reserved, must be zero
// 48 tile0.rows Tile 0 rows.
// 49 tile1.rows Tile 1 rows.
// 50 tile2.rows Tile 2 rows.
// ... (sequence continues)
// 55 tile7.rows Tile 7 rows.
// 56-63 reserved, must be zero
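// As an illustrative example (not taken from a specific test): if a tile
// shape is assigned to physical register TMM2, then Index below is 2, its
// row count is stored as a byte at offset 48 + 2 = 50, and its bytes-per-row
// is stored as a word at offset 16 + 2 * 2 = 20 of the config stack slot.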
unsigned Index = getTilePhysRegIndex(PhysReg);
int RowOffset = 48 + Index;
int ColOffset = 16 + Index * 2;
unsigned BitSize = 8;
for (const auto &Pair : {std::make_pair(RowReg, RowOffset),
std::make_pair(ColReg, ColOffset)}) {
int64_t Imm = 0;
int ImmCount = 0;
// All defs must have the same value, otherwise the MIs are invalid.
// An immediate is preferred.
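// For illustration (hypothetical MIR, not from the tests): a shape defined
// as "%row:gr16 = MOV16ri 16" lets the immediate 16 be stored directly into
// the config slot, while a shape only known at run time falls back to
// storeRegToStackSlot below.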
for (const MachineOperand &MO : MRI->def_operands(Pair.first)) {
const auto *Inst = MO.getParent();
if (Inst->isMoveImmediate()) {
ImmCount++;
Imm = Inst->getOperand(1).getImm();
break;
}
}
auto StoreConfig = [&](int Offset) {
MachineInstr *NewMI = nullptr;
if (ImmCount)
NewMI = storeImmToStackSlot(*MBB, *MI, Imm, BitSize, SS, Offset, TII);
else {
const TargetRegisterClass *RC = MRI->getRegClass(Pair.first);
NewMI = storeRegToStackSlot(*MBB, *MI, Pair.first, BitSize, SS,
Offset, TII, RC, TRI);
}
SlotIndex SIdx = LIS->InsertMachineInstrInMaps(*NewMI);
if (!ImmCount) {
// Extend the live interval.
SmallVector<SlotIndex, 8> EndPoints = {SIdx.getRegSlot()};
LiveInterval &Int = LIS->getInterval(Pair.first);
LIS->extendToIndices(Int, EndPoints);
}
};
StoreConfig(Pair.second);
BitSize += 8;
}
}
}
bool X86TileConfig::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
MRI = &mf.getRegInfo();
ST = &mf.getSubtarget<X86Subtarget>();
TRI = ST->getRegisterInfo();
TII = ST->getInstrInfo();
DomTree = &getAnalysis<MachineDominatorTree>();
VRM = &getAnalysis<VirtRegMap>();
LIS = &getAnalysis<LiveIntervals>();
if (VRM->isShapeMapEmpty())
return false;
tileConfig();
return true;
}
FunctionPass *llvm::createX86TileConfigPass() { return new X86TileConfig(); }

View File

@ -0,0 +1,91 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
%struct.__tile_str = type <{ i16, i16, [60 x i8], <256 x i32> }>
@buf = dso_local global [3072 x i8] zeroinitializer, align 16
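; Tiles %3 and %4 are live across the call to @foo, so the generated code
; spills them to the stack before the call and reloads them afterwards; the
; tile configuration itself is saved with sttilecfg and restored with
; ldtilecfg around the call.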
define dso_local void @test_api(i16 signext %0, i16 signext %1) local_unnamed_addr #2 {
; CHECK-LABEL: test_api:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: subq $4056, %rsp # imm = 0xFD8
; CHECK-NEXT: .cfi_def_cfa_offset 4096
; CHECK-NEXT: .cfi_offset %rbx, -40
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: movl %edi, %ebp
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
; CHECK-NEXT: movl $buf, %eax
; CHECK-NEXT: movl $32, %r14d
; CHECK-NEXT: movw $8, %r15w
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm1
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm1, 2048(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: movl $buf+1024, %eax
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm2, 1024(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: movl $buf+2048, %eax
; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0
; CHECK-NEXT: movabsq $64, %rcx
; CHECK-NEXT: tileloadd 2048(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload
; CHECK-NEXT: movabsq $64, %rcx
; CHECK-NEXT: tileloadd 1024(%rsp,%rcx), %tmm2 # 1024-byte Folded Reload
; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
; CHECK-NEXT: tilestored %tmm0, (%rax,%r14)
; CHECK-NEXT: addq $4056, %rsp # imm = 0xFD8
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: popq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: popq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: tilerelease
; CHECK-NEXT: retq
%3 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32) #4
%4 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32) #4
tail call void (...) @foo() #4
%5 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32) #4
%6 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, <256 x i32> %5, <256 x i32> %3, <256 x i32> %4) #4
tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, <256 x i32> %6) #4
ret void
}
declare dso_local void @foo(...) local_unnamed_addr #3
declare <256 x i32> @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #4
declare <256 x i32> @llvm.x86.tdpbssd.internal(i16, i16, i16, <256 x i32>, <256 x i32>, <256 x i32>) #4
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, <256 x i32>) #4
attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="8192" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { nounwind }

View File

@ -0,0 +1,77 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -verify-machineinstrs | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
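; The tile loads live in two branches and are merged by phis at the join
; block; a single ldtilecfg is emitted before the branch so both paths share
; one tile configuration.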
; Function Attrs: nounwind uwtable
define dso_local void @test_api(i32 %0, i16 signext %1, i16 signext %2) local_unnamed_addr #2 {
; CHECK-LABEL: test_api:
; CHECK: # %bb.0:
; CHECK-NEXT: movsbl %sil, %eax
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: je .LBB0_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: movl $buf, %ecx
; CHECK-NEXT: jmp .LBB0_3
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: movl $buf2, %ecx
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: movl $32, %edi
; CHECK-NEXT: tileloadd (%rcx,%rdi), %tmm0
; CHECK-NEXT: tileloadd (%rcx,%rdi), %tmm2
; CHECK-NEXT: tileloadd (%rcx,%rdi), %tmm1
; CHECK-NEXT: tdpbssd %tmm2, %tmm0, %tmm1
; CHECK-NEXT: movl $buf, %ecx
; CHECK-NEXT: movl $32, %esi
; CHECK-NEXT: tilestored %tmm1, (%rcx,%rsi)
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%4 = icmp eq i32 %0, 0
%5 = shl i16 %1, 8
%6 = ashr exact i16 %5, 8
br i1 %4, label %11, label %7
7: ; preds = %3
%8 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %6, i16 %1, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
%9 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
%10 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
br label %15
11: ; preds = %3
%12 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %6, i16 %1, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) #3
%13 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) #3
%14 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) #3
br label %15
15: ; preds = %11, %7
%16 = phi <256 x i32> [ %12, %11 ], [ %8, %7 ]
%17 = phi <256 x i32> [ %13, %11 ], [ %9, %7 ]
%18 = phi <256 x i32> [ %14, %11 ], [ %10, %7 ]
%19 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %6, i16 %2, i16 %1, <256 x i32> %18, <256 x i32> %16, <256 x i32> %17) #3
tail call void @llvm.x86.tilestored64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, <256 x i32> %19) #3
ret void
}
declare <256 x i32> @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #3
declare <256 x i32> @llvm.x86.tdpbssd.internal(i16, i16, i16, <256 x i32>, <256 x i32>, <256 x i32>) #3
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, <256 x i32>) #3
attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="8192" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+avx,+avx2,+avx512f,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind }

View File

@ -0,0 +1,112 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
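; More tiles are live at the tdpbssd chain than there are physical tile
; registers, so one tile is spilled to the stack and reloaded before the
; dot-product chain is computed.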
define dso_local void @test_api(i32 %0, i16 signext %1, i16 signext %2) local_unnamed_addr #2 {
; CHECK-LABEL: test_api:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $2936, %rsp # imm = 0xB78
; CHECK-NEXT: .cfi_def_cfa_offset 2944
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %si, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $buf, %r8d
; CHECK-NEXT: movl $32, %eax
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm1
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm1
; CHECK-NEXT: movabsq $64, %rcx
; CHECK-NEXT: tilestored %tmm1, 896(%rsp,%rcx) # 1024-byte Folded Spill
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm3
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm4
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm2
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm5
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm0
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: je .LBB0_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm6
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm7
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm1
; CHECK-NEXT: jmp .LBB0_3
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: movl $buf2, %ecx
; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm6
; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm7
; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm1
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: tdpbssd %tmm7, %tmm6, %tmm1
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tileloadd 896(%rsp,%rax), %tmm7 # 1024-byte Folded Reload
; CHECK-NEXT: tdpbssd %tmm7, %tmm1, %tmm3
; CHECK-NEXT: tdpbssd %tmm4, %tmm3, %tmm2
; CHECK-NEXT: tdpbssd %tmm5, %tmm2, %tmm0
; CHECK-NEXT: movl $buf, %eax
; CHECK-NEXT: movl $32, %ecx
; CHECK-NEXT: tilestored %tmm0, (%rax,%rcx)
; CHECK-NEXT: addq $2936, %rsp # imm = 0xB78
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%4 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
%5 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
%6 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
%7 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
%8 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
%9 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
%10 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
%11 = icmp eq i32 %0, 0
br i1 %11, label %16, label %12
12: ; preds = %3
%13 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %1, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
%14 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
%15 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
br label %20
16: ; preds = %3
%17 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %1, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) #3
%18 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) #3
%19 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) #3
br label %20
20: ; preds = %16, %12
%21 = phi <256 x i32> [ %17, %16 ], [ %13, %12 ]
%22 = phi <256 x i32> [ %18, %16 ], [ %14, %12 ]
%23 = phi <256 x i32> [ %19, %16 ], [ %15, %12 ]
%24 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %1, i16 %2, i16 %1, <256 x i32> %23, <256 x i32> %21, <256 x i32> %22) #3
%25 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %1, i16 %2, i16 %2, <256 x i32> %6, <256 x i32> %24, <256 x i32> %5) #3
%26 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %1, i16 %2, i16 %2, <256 x i32> %8, <256 x i32> %25, <256 x i32> %7) #3
%27 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %2, i16 %2, i16 %2, <256 x i32> %10, <256 x i32> %26, <256 x i32> %9) #3
tail call void @llvm.x86.tilestored64.internal(i16 %2, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, <256 x i32> %27) #3
ret void
}
declare <256 x i32> @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #3
declare <256 x i32> @llvm.x86.tdpbssd.internal(i16, i16, i16, <256 x i32>, <256 x i32>, <256 x i32>) #3
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, <256 x i32>) #3
attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="8192" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind }

View File

@ -0,0 +1,143 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -lower-amx-type %s -S | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
%struct.__tile_str = type { i16, i16, <256 x i32> }
@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
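; A plain <256 x i32> load/store with no AMX intrinsic user is split into two
; <128 x i32> accesses, while <256 x i32> values produced or consumed by the
; AMX internal intrinsics are rewritten to go through
; llvm.x86.tileloadd64.internal / llvm.x86.tilestored64.internal instead of
; vector load/store.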
define dso_local void @test_load(i8* %in, i8* %out) local_unnamed_addr #2 {
; CHECK-LABEL: @test_load(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[IN:%.*]] to <256 x i32>*
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[OUT:%.*]] to <256 x i32>*
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i32>* [[TMP1]] to <128 x i32>*
; CHECK-NEXT: [[TMP4:%.*]] = load <128 x i32>, <128 x i32>* [[TMP3]], align 64
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr <128 x i32>, <128 x i32>* [[TMP3]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = load <128 x i32>, <128 x i32>* [[TMP5]], align 64
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <256 x i32>* [[TMP2]] to <128 x i32>*
; CHECK-NEXT: store <128 x i32> [[TMP4]], <128 x i32>* [[TMP7]], align 64
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <128 x i32>, <128 x i32>* [[TMP7]], i32 1
; CHECK-NEXT: store <128 x i32> [[TMP6]], <128 x i32>* [[TMP8]], align 64
; CHECK-NEXT: ret void
;
%1 = bitcast i8* %in to <256 x i32>*
%2 = bitcast i8* %out to <256 x i32>*
%3 = load <256 x i32>, <256 x i32>* %1, align 64, !tbaa !8
store <256 x i32> %3, <256 x i32>* %2, align 64, !tbaa !8
ret void
}
define dso_local void @__tile_loadd(%struct.__tile_str* nocapture %0, i8* %1, i64 %2) local_unnamed_addr #0 {
; CHECK-LABEL: @__tile_loadd(
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], %struct.__tile_str* [[TMP0:%.*]], i64 0, i32 0
; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP4]], align 64, [[TBAA2:!tbaa !.*]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP0]], i64 0, i32 1
; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[TMP6]], align 2, [[TBAA7:!tbaa !.*]]
; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP2:%.*]], 32
; CHECK-NEXT: [[TMP9:%.*]] = ashr exact i64 [[TMP8]], 32
; CHECK-NEXT: [[TMP10:%.*]] = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP1:%.*]], i64 [[TMP9]]) [[ATTR3:#.*]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP0]], i64 0, i32 2
; CHECK-NEXT: [[TMP12:%.*]] = bitcast <256 x i32>* [[TMP11]] to i8*
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP12]], i64 64, <256 x i32> [[TMP10]])
; CHECK-NEXT: ret void
;
%4 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %0, i64 0, i32 0
%5 = load i16, i16* %4, align 64, !tbaa !2
%6 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %0, i64 0, i32 1
%7 = load i16, i16* %6, align 2, !tbaa !7
%8 = shl i64 %2, 32
%9 = ashr exact i64 %8, 32
%10 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %5, i16 %7, i8* %1, i64 %9) #3
%11 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %0, i64 0, i32 2
store <256 x i32> %10, <256 x i32>* %11, align 64, !tbaa !8
ret void
}
define dso_local void @__tile_dpbsud(%struct.__tile_str* nocapture %0, %struct.__tile_str* nocapture readonly byval(%struct.__tile_str) align 64 %1, %struct.__tile_str* nocapture readonly byval(%struct.__tile_str) align 64 %2) local_unnamed_addr #0 {
; CHECK-LABEL: @__tile_dpbsud(
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], %struct.__tile_str* [[TMP1:%.*]], i64 0, i32 0
; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP4]], align 64, [[TBAA2]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP2:%.*]], i64 0, i32 1
; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[TMP6]], align 2, [[TBAA7]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP1]], i64 0, i32 1
; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP8]], align 2, [[TBAA7]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP0:%.*]], i64 0, i32 2
; CHECK-NEXT: [[TMP11:%.*]] = bitcast <256 x i32>* [[TMP10]] to i8*
; CHECK-NEXT: [[TMP12:%.*]] = call <256 x i32> @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP11]], i64 64)
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP1]], i64 0, i32 2
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <256 x i32>* [[TMP13]] to i8*
; CHECK-NEXT: [[TMP15:%.*]] = call <256 x i32> @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[TMP9]], i8* [[TMP14]], i64 64)
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP2]], i64 0, i32 2
; CHECK-NEXT: [[TMP17:%.*]] = bitcast <256 x i32>* [[TMP16]] to i8*
; CHECK-NEXT: [[TMP18:%.*]] = call <256 x i32> @llvm.x86.tileloadd64.internal(i16 [[TMP9]], i16 [[TMP7]], i8* [[TMP17]], i64 64)
; CHECK-NEXT: [[TMP19:%.*]] = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 [[TMP5]], i16 [[TMP7]], i16 [[TMP9]], <256 x i32> [[TMP12]], <256 x i32> [[TMP15]], <256 x i32> [[TMP18]]) [[ATTR3]]
; CHECK-NEXT: [[TMP20:%.*]] = bitcast <256 x i32>* [[TMP10]] to i8*
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP20]], i64 64, <256 x i32> [[TMP19]])
; CHECK-NEXT: ret void
;
%4 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %1, i64 0, i32 0
%5 = load i16, i16* %4, align 64, !tbaa !2
%6 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 1
%7 = load i16, i16* %6, align 2, !tbaa !7
%8 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %1, i64 0, i32 1
%9 = load i16, i16* %8, align 2, !tbaa !7
%10 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %0, i64 0, i32 2
%11 = load <256 x i32>, <256 x i32>* %10, align 64, !tbaa !8
%12 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %1, i64 0, i32 2
%13 = load <256 x i32>, <256 x i32>* %12, align 64, !tbaa !8
%14 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 2
%15 = load <256 x i32>, <256 x i32>* %14, align 64, !tbaa !8
%16 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %5, i16 %7, i16 %9, <256 x i32> %11, <256 x i32> %13, <256 x i32> %15) #3
store <256 x i32> %16, <256 x i32>* %10, align 64, !tbaa !8
ret void
}
define dso_local void @__tile_stored(i8* %0, i64 %1, %struct.__tile_str* nocapture readonly byval(%struct.__tile_str) align 64 %2) local_unnamed_addr #1 {
; CHECK-LABEL: @__tile_stored(
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], %struct.__tile_str* [[TMP2:%.*]], i64 0, i32 0
; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP4]], align 64, [[TBAA2]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP2]], i64 0, i32 1
; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[TMP6]], align 2, [[TBAA7]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP2]], i64 0, i32 2
; CHECK-NEXT: [[TMP9:%.*]] = bitcast <256 x i32>* [[TMP8]] to i8*
; CHECK-NEXT: [[TMP10:%.*]] = call <256 x i32> @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP9]], i64 64)
; CHECK-NEXT: [[TMP11:%.*]] = shl i64 [[TMP1:%.*]], 32
; CHECK-NEXT: [[TMP12:%.*]] = ashr exact i64 [[TMP11]], 32
; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP0:%.*]], i64 [[TMP12]], <256 x i32> [[TMP10]]) [[ATTR3]]
; CHECK-NEXT: ret void
;
%4 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 0
%5 = load i16, i16* %4, align 64, !tbaa !2
%6 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 1
%7 = load i16, i16* %6, align 2, !tbaa !7
%8 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 2
%9 = load <256 x i32>, <256 x i32>* %8, align 64, !tbaa !8
%10 = shl i64 %1, 32
%11 = ashr exact i64 %10, 32
tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %7, i8* %0, i64 %11, <256 x i32> %9) #3
ret void
}
declare <256 x i32> @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #3
declare <256 x i32> @llvm.x86.tdpbssd.internal(i16, i16, i16, <256 x i32>, <256 x i32>, <256 x i32>) #3
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, <256 x i32>) #3
attributes #0 = { alwaysinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="8192" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { alwaysinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+avx,+avx2,+avx512f,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 12.0.0 (ssh://git-amr-1.devtools.intel.com:29418/dpd_icl-llvm_project_worldread f3c78a3f053379a2511e00e9ce2c13383ea3f835)"}
!2 = !{!3, !4, i64 0}
!3 = !{!"__tile_str", !4, i64 0, !4, i64 2, !5, i64 1024}
!4 = !{!"short", !5, i64 0}
!5 = !{!"omnipotent char", !6, i64 0}
!6 = !{!"Simple C/C++ TBAA"}
!7 = !{!3, !4, i64 2}
!8 = !{!5, !5, i64 0}

View File

@ -18,6 +18,7 @@
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Lower AMX type for load/store
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Lower Garbage Collection Instructions
; CHECK-NEXT: Shadow Stack GC Lowering

View File

@ -3,7 +3,7 @@
target triple = "x86_64-unknown-unknown"
declare void @bar1()
define preserve_allcc void @foo()#0 {
; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $fpcw $fpsw $fs $gs $hip $ip $mxcsr $rip $riz $ss $ssp $bnd0 $bnd1 $bnd2 $bnd3 $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $r11b $r11bh $r11d $r11w $r11wh $k0_k1 $k2_k3 $k4_k5 $k6_k7
; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $fpcw $fpsw $fs $gs $hip $ip $mxcsr $rip $riz $ss $ssp $tmmcfg $bnd0 $bnd1 $bnd2 $bnd3 $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $r11b $r11bh $r11d $r11w $r11wh $k0_k1 $k2_k3 $k4_k5 $k6_k7
call void @bar1()
call void @bar2()
ret void

View File

@ -24,6 +24,7 @@
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Lower AMX type for load/store
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
@ -118,11 +119,12 @@
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: X86 EFLAGS copy lowering
; CHECK-NEXT: X86 WinAlloca Expander
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: Tile Register Pre-configure
; CHECK-NEXT: Detect Dead Lanes
; CHECK-NEXT: Process Implicit Definitions
; CHECK-NEXT: Remove unreachable machine basic blocks
; CHECK-NEXT: Live Variable Analysis
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: Machine Natural Loop Construction
; CHECK-NEXT: Eliminate PHI nodes for register allocation
; CHECK-NEXT: Two-Address instruction pass
@ -141,6 +143,7 @@
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Machine Optimization Remark Emitter
; CHECK-NEXT: Greedy Register Allocator
; CHECK-NEXT: Tile Register Configure
; CHECK-NEXT: Virtual Register Rewriter
; CHECK-NEXT: Stack Slot Coloring
; CHECK-NEXT: Machine Copy Propagation Pass

View File

@ -91,7 +91,7 @@ body: |
; CHECK-DAG: MOV64mr %stack.1, 1, $noreg, 0, $noreg, $rdi :: (store 8 into %stack.1)
; CHECK: EH_LABEL <mcsymbol .Ltmp0>
; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK: STATEPOINT 0, 0, 1, @some_call, $rdi, 2, 0, 2, 0, 2, 5, 2, 0, 2, -1, 2, 0, 2, 0, 2, 0, 2, 2, 1, 8, %stack.0, 0, 1, 8, %stack.1, 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store 8 on %stack.1), (load store 8 on %stack.0)
; CHECK: STATEPOINT 0, 0, 1, @some_call, $rdi, 2, 0, 2, 0, 2, 5, 2, 0, 2, -1, 2, 0, 2, 0, 2, 0, 2, 2, 1, 8, %stack.0, 0, 1, 8, %stack.1, 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store 8 on %stack.0), (load store 8 on %stack.1)
; CHECK-DAG: $r14 = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load 8 from %stack.0)
; CHECK-DAG: $rbx = MOV64rm %stack.1, 1, $noreg, 0, $noreg :: (load 8 from %stack.1)
; CHECK: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp

View File

@ -108,7 +108,7 @@ body: |
; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK: MOV64mr [[STACK0:%stack.[0-9]+]], 1, $noreg, 0, $noreg, killed $rbx :: (store 8 into [[STACK0]])
; CHECK: MOV64mr [[STACK1:%stack.[0-9]+]], 1, $noreg, 0, $noreg, killed $r14 :: (store 8 into [[STACK1]])
; CHECK: STATEPOINT 0, 0, 0, @foo, 2, 0, 2, 0, 2, 0, 2, 2, 1, 8, [[STACK0]], 0, 1, 8, [[STACK1]], 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store 8 on [[STACK0]]), (load store 8 on [[STACK1]])
; CHECK: STATEPOINT 0, 0, 0, @foo, 2, 0, 2, 0, 2, 0, 2, 2, 1, 8, [[STACK0]], 0, 1, 8, [[STACK1]], 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store 8 on [[STACK1]]), (load store 8 on [[STACK0]])
; CHECK-DAG: $rbx = MOV64rm [[STACK0]], 1, $noreg, 0, $noreg :: (load 8 from [[STACK0]])
; CHECK-DAG: $r14 = MOV64rm [[STACK1]], 1, $noreg, 0, $noreg :: (load 8 from [[STACK1]])
; CHECK: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
@ -121,7 +121,7 @@ body: |
; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-DAG: MOV64mr [[STACK0]], 1, $noreg, 0, $noreg, killed $rbx :: (store 8 into [[STACK0]])
; CHECK-DAG: MOV64mr [[STACK1]], 1, $noreg, 0, $noreg, killed $r14 :: (store 8 into [[STACK1]])
; CHECK: STATEPOINT 0, 0, 0, @bar, 2, 0, 2, 0, 2, 0, 2, 2, 1, 8, %stack.0, 0, 1, 8, [[STACK1]], 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store 8 on [[STACK0]]), (load store 8 on [[STACK1]])
; CHECK: STATEPOINT 0, 0, 0, @bar, 2, 0, 2, 0, 2, 0, 2, 2, 1, 8, %stack.0, 0, 1, 8, [[STACK1]], 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store 8 on [[STACK1]]), (load store 8 on [[STACK0]])
; CHECK-DAG: $rbx = MOV64rm [[STACK0]], 1, $noreg, 0, $noreg :: (load 8 from [[STACK0]])
; CHECK-DAG: $r14 = MOV64rm [[STACK1]], 1, $noreg, 0, $noreg :: (load 8 from [[STACK1]])
; CHECK: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp