[X86] AMX programming model.

This patch implements the AMX programming model discussed on llvm-dev
(http://lists.llvm.org/pipermail/llvm-dev/2020-August/144302.html).
Thanks to Hal for the good suggestion on the RA. The fast RA is not in
this patch yet.

The patch implements 7 components:

1. The C interface to the end user.
2. The AMX intrinsics in LLVM IR.
3. Transform load/store <256 x i32> to AMX intrinsics, or split the type
   into two <128 x i32>.
4. The lowering from AMX intrinsics to AMX pseudo instructions.
5. Insert a pseudo ldtilecfg and build the def-use chain between the
   ldtilecfg and the AMX instructions.
6. The register allocation for tile registers.
7. Morph AMX pseudo instructions into AMX real instructions.

Change-Id: I935e1080916ffcb72af54c2c83faa8b2e97d5cb0
Differential Revision: https://reviews.llvm.org/D87981
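For orientation, a minimal sketch of the end-user interface (component 1), modeled on the integration test added in this patch. Buffer names and sizes are illustrative, and an AMX-enabled compiler and CPU are assumed:

#include <immintrin.h>

#define STRIDE 32

static char bufA[1024], bufB[1024], bufC[1024]; // illustrative buffers

// C += A * B on int8 tiles accumulating into int32: a is (row x 8),
// b is (8 x col), c is (row x col). The shape fields of __tile1024i are
// const and fixed at initialization; the compiler uses them to configure
// the tile palette before the loads (component 5).
void matmul_sketch(short row, short col) {
  __tile1024i a = {row, 8};
  __tile1024i b = {8, col};
  __tile1024i c = {row, col};

  __tile_loadd(&a, bufA, STRIDE);  // load the A tile
  __tile_loadd(&b, bufB, STRIDE);  // load the B tile
  __tile_loadd(&c, bufC, STRIDE);  // load the C accumulator tile
  __tile_dpbsud(&c, a, b);         // c += a * b (int8 dot-product)
  __tile_stored(bufC, STRIDE, c);  // write the accumulator back to memory
}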
parent 09f717b929
commit f80b29878b
@@ -100,6 +100,10 @@ TARGET_BUILTIN(__builtin_ia32_stui, "v", "n", "uintr")
TARGET_BUILTIN(__builtin_ia32_testui, "Uc", "n", "uintr")
TARGET_BUILTIN(__builtin_ia32_senduipi, "vUWi", "n", "uintr")

// AMX internal builtin
TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
TARGET_BUILTIN(__builtin_ia32_tilestored64_internal, "vUsUsv*zV256i", "n", "amx-tile")
// AMX
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
@@ -15,8 +15,8 @@
#define __AMXINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
#define __DEFAULT_FN_ATTRS_TILE \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))

/// Load tile configuration from a 64-byte memory location specified by
/// "mem_addr". The tile configuration includes the tile type palette, the

@@ -31,9 +31,8 @@
///
/// \param __config
///    A pointer to 512-bit configuration
static __inline__ void __DEFAULT_FN_ATTRS
_tile_loadconfig(const void *__config)
{
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_loadconfig(const void *__config) {
  __builtin_ia32_tile_loadconfig(__config);
}

@@ -48,9 +47,8 @@ _tile_loadconfig(const void *__config)
///
/// \param __config
///    A pointer to 512-bit configuration
static __inline__ void __DEFAULT_FN_ATTRS
_tile_storeconfig(void *__config)
{
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_storeconfig(void *__config) {
  __builtin_ia32_tile_storeconfig(__config);
}

@@ -60,9 +58,7 @@ _tile_storeconfig(void *__config)
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
static __inline__ void __DEFAULT_FN_ATTRS
_tile_release(void)
{
static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
  __builtin_ia32_tilerelease();
}

@@ -80,8 +76,9 @@ _tile_release(void)
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
#define _tile_loadd(dst, base, stride) \
  __builtin_ia32_tileloadd64((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride))
#define _tile_loadd(dst, base, stride) \
  __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
                             (__SIZE_TYPE__)(stride))

/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured

@@ -99,8 +96,9 @@ _tile_release(void)
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
#define _tile_stream_loadd(dst, base, stride) \
  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride))
#define _tile_stream_loadd(dst, base, stride) \
  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
                               (__SIZE_TYPE__)(stride))

/// Store the tile specified by "src" to memory specified by "base" address and
/// "stride" using the tile configuration previously configured via

@@ -116,7 +114,7 @@ _tile_release(void)
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be stored in memory.
#define _tile_stored(dst, base, stride) \
#define _tile_stored(dst, base, stride) \
  __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))

/// Zero the tile specified by "tdest".

@@ -145,7 +143,8 @@ _tile_release(void)
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbssd(dst, src0, src1) __builtin_ia32_tdpbssd((dst), (src0), (src1))
#define _tile_dpbssd(dst, src0, src1) \
  __builtin_ia32_tdpbssd((dst), (src0), (src1))

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with

@@ -163,7 +162,8 @@ _tile_release(void)
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbsud(dst, src0, src1) __builtin_ia32_tdpbsud((dst), (src0), (src1))
#define _tile_dpbsud(dst, src0, src1) \
  __builtin_ia32_tdpbsud((dst), (src0), (src1))

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with

@@ -181,7 +181,8 @@ _tile_release(void)
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbusd(dst, src0, src1) __builtin_ia32_tdpbusd((dst), (src0), (src1))
#define _tile_dpbusd(dst, src0, src1) \
  __builtin_ia32_tdpbusd((dst), (src0), (src1))

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with

@@ -199,7 +200,8 @@ _tile_release(void)
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbuud(dst, src0, src1) __builtin_ia32_tdpbuud((dst), (src0), (src1))
#define _tile_dpbuud(dst, src0, src1) \
  __builtin_ia32_tdpbuud((dst), (src0), (src1))

/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point

@@ -216,10 +218,56 @@ _tile_release(void)
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbf16ps(dst, src0, src1) \
#define _tile_dpbf16ps(dst, src0, src1) \
  __builtin_ia32_tdpbf16ps((dst), (src0), (src1))

#undef __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS_INT8 \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))

typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
                     __SIZE_TYPE__ stride) {
  return __builtin_ia32_tileloadd64_internal(m, n, base,
                                             (__SIZE_TYPE__)(stride));
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
}

static __inline__ void __DEFAULT_FN_ATTRS_INT8
_tile_stored_internal(unsigned short m, unsigned short n, void *base,
                      __SIZE_TYPE__ stride, _tile1024i tile) {
  return __builtin_ia32_tilestored64_internal(m, n, base,
                                              (__SIZE_TYPE__)(stride), tile);
}

typedef struct __tile1024i_str {
  const unsigned short row;
  const unsigned short col;
  _tile1024i tile;
} __tile1024i;

__DEFAULT_FN_ATTRS_INT8
static void __tile_loadd(__tile1024i *dst, const void *base,
                         __SIZE_TYPE__ stride) {
  dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
}

__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbsud(__tile1024i *dst, __tile1024i src1,
                          __tile1024i src2) {
  dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile,
                                    src1.tile, src2.tile);
}

__DEFAULT_FN_ATTRS_INT8
static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
  _tile_stored_internal(src.row, src.col, base, stride, src.tile);
}

#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */
@@ -0,0 +1,51 @@
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx512f -target-feature +amx-int8 \
// RUN: -target-feature +amx-bf16 -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK

#include <immintrin.h>

char buf[1024];
#define STRIDE 32

char buf2[1024];

// This is example code and an integration test.
void test_api(int cond, short row, short col) {
  //CHECK-LABEL: @test_api
  //CHECK: call <256 x i32> @llvm.x86.tileloadd64.internal
  //CHECK: call <256 x i32> @llvm.x86.tdpbssd.internal
  //CHECK: call void @llvm.x86.tilestored64.internal
  __tile1024i a = {row, 8};
  __tile1024i b = {8, col};
  __tile1024i c = {row, col};

  if (cond) {
    __tile_loadd(&a, buf, STRIDE);
    __tile_loadd(&b, buf, STRIDE);
    __tile_loadd(&c, buf, STRIDE);
  } else {
    __tile_loadd(&a, buf2, STRIDE);
    __tile_loadd(&b, buf2, STRIDE);
    __tile_loadd(&c, buf2, STRIDE);
  }
  __tile_dpbsud(&c, a, b);
  __tile_stored(buf, STRIDE, c);
}

void test_tile_loadd(short row, short col) {
  //CHECK-LABEL: @test_tile_loadd
  //CHECK: call <256 x i32> @llvm.x86.tileloadd64.internal
  __tile1024i a = {row, col};
  __tile_loadd(&a, buf, STRIDE);
}

void test_tile_dpbsud(__tile1024i a, __tile1024i b, __tile1024i c) {
  //CHECK-LABEL: @test_tile_dpbsud
  //CHECK: call <256 x i32> @llvm.x86.tdpbssd.internal
  __tile_dpbsud(&c, a, b);
}

void test_tile_stored(__tile1024i c) {
  //CHECK-LABEL: @test_tile_stored
  //CHECK: call void @llvm.x86.tilestored64.internal
  __tile_stored(buf, STRIDE, c);
}
@@ -104,6 +104,9 @@ public:
  void verify(LiveVirtRegBitSet& VisitedVRegs);
#endif

  // Get any virtual register that is assigned to this physical unit.
  LiveInterval *getOneVReg() const;

  /// Query interferences between a single live virtual register and a live
  /// interval union.
  class Query {
@@ -153,6 +153,8 @@ public:
  /// Directly access the live interval unions per regunit.
  /// This returns an array indexed by the regunit number.
  LiveIntervalUnion *getLiveUnions() { return &Matrix[0]; }

  Register getOneVReg(unsigned PhysReg) const;
};

} // end namespace llvm
@@ -485,6 +485,10 @@ namespace llvm {
  /// The pass fixups statepoint machine instruction to replace usage of
  /// caller saved registers with stack slots.
  extern char &FixupStatepointCallerSavedID;

  /// The pass transforms load/store <256 x i32> to AMX load/store intrinsics
  /// or splits the data into two <128 x i32>.
  FunctionPass *createX86LowerAMXTypePass();
} // End llvm namespace

#endif
@@ -0,0 +1,107 @@
//===- llvm/CodeGen/TileShapeInfo.h - ---------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Shape utility for AMX.
/// AMX hardware requires the shape of a tile data register to be configured
/// before use. The 2D shape consists of row and column. In the AMX intrinsics
/// interface the shape is passed as the 1st and 2nd parameters, and it is
/// lowered as the 1st and 2nd machine operands of the AMX pseudo
/// instructions. The ShapeT class facilitates tile config and the register
/// allocator.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_TILESHAPEINFO_H
#define LLVM_CODEGEN_TILESHAPEINFO_H

#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Register.h"
#include <utility>

using namespace llvm;

namespace llvm {

class ShapeT {
public:
  ShapeT(MachineOperand *Row, MachineOperand *Col,
         const MachineRegisterInfo *MRI = nullptr)
      : Row(Row), Col(Col) {
    if (MRI)
      deduceImm(MRI);
  }
  ShapeT()
      : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
        ColImm(InvalidImmShape) {}
  bool operator==(const ShapeT &Shape) {
    MachineOperand *R = Shape.Row;
    MachineOperand *C = Shape.Col;
    if (!R || !C)
      return false;
    if (!Row || !Col)
      return false;
    if (Row->getReg() == R->getReg() && Col->getReg() == C->getReg())
      return true;
    if ((RowImm != InvalidImmShape) && (ColImm != InvalidImmShape))
      return RowImm == Shape.getRowImm() && ColImm == Shape.getColImm();
    return false;
  }

  bool operator!=(const ShapeT &Shape) { return !(*this == Shape); }

  ShapeT &operator=(const ShapeT &RHS) {
    Row = RHS.Row;
    Col = RHS.Col;
    RowImm = RHS.RowImm;
    ColImm = RHS.ColImm;
    return *this;
  }

  MachineOperand *getRow() const { return Row; }

  MachineOperand *getCol() const { return Col; }

  int64_t getRowImm() const { return RowImm; }

  int64_t getColImm() const { return ColImm; }

  bool isValid() { return (Row != nullptr) && (Col != nullptr); }

  void deduceImm(const MachineRegisterInfo *MRI) {
    // All defs must set the same immediate value; otherwise the MIR is
    // invalid. Find the immediate.
    // TODO: copy propagation.
    auto GetImm = [&](Register Reg) {
      int64_t Imm = InvalidImmShape;
      for (const MachineOperand &DefMO : MRI->def_operands(Reg)) {
        const auto *MI = DefMO.getParent();
        if (MI->isMoveImmediate()) {
          Imm = MI->getOperand(1).getImm();
          break;
        }
      }
      return Imm;
    };
    RowImm = GetImm(Row->getReg());
    ColImm = GetImm(Col->getReg());
  }

private:
  static constexpr int64_t InvalidImmShape = -1;
  MachineOperand *Row;
  MachineOperand *Col;
  int64_t RowImm;
  int64_t ColImm;
};

} // namespace llvm

#endif
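As an illustration (hypothetical MIR, not from the patch), deduceImm resolves the shape immediates when the row/column virtual registers are defined by move-immediate instructions:

// Hypothetical MIR; register numbers and values are illustrative:
//   %0:gr16 = MOV16ri 16        ; row
//   %1:gr16 = MOV16ri 64        ; col
//   %2:tile = PTILELOADDV %0:gr16, %1:gr16, ...
// Constructing ShapeT(&RowMO, &ColMO, MRI) runs deduceImm, which walks the
// defs of %0 and %1, finds the isMoveImmediate() instructions, and records
// RowImm = 16, ColImm = 64. operator== can then match two shapes carried in
// different virtual registers by comparing the deduced immediates.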
@@ -19,6 +19,7 @@
#include "llvm/ADT/IndexedMap.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TileShapeInfo.h"
#include "llvm/Pass.h"
#include <cassert>

@@ -60,6 +61,10 @@ class TargetInstrInfo;
  /// mapping.
  IndexedMap<unsigned, VirtReg2IndexFunctor> Virt2SplitMap;

  /// Virt2ShapeMap - For an X86 AMX register, the shape information bound to
  /// the register.
  DenseMap<unsigned, ShapeT> Virt2ShapeMap;

  /// createSpillSlot - Allocate a spill slot for RC from MFI.
  unsigned createSpillSlot(const TargetRegisterClass *RC);

@@ -107,6 +112,21 @@ class TargetInstrInfo;
  /// the specified physical register
  void assignVirt2Phys(Register virtReg, MCPhysReg physReg);

  bool isShapeMapEmpty() const { return Virt2ShapeMap.empty(); }

  bool hasShape(Register virtReg) const {
    return getShape(virtReg).isValid();
  }

  ShapeT getShape(Register virtReg) const {
    assert(virtReg.isVirtual());
    return Virt2ShapeMap.lookup(virtReg);
  }

  void assignVirt2Shape(Register virtReg, ShapeT shape) {
    Virt2ShapeMap[virtReg.id()] = shape;
  }

  /// clears the specified virtual register's physical
  /// register mapping
  void clearVirt(Register virtReg) {

@@ -133,6 +153,9 @@ class TargetInstrInfo;
  /// records virtReg is a split live interval from SReg.
  void setIsSplitFromReg(Register virtReg, unsigned SReg) {
    Virt2SplitMap[virtReg.id()] = SReg;
    if (hasShape(SReg)) {
      Virt2ShapeMap[virtReg.id()] = getShape(SReg);
    }
  }

  /// returns the live interval virtReg is split from.
@@ -292,6 +292,7 @@ def llvm_v8i32_ty : LLVMType<v8i32>; // 8 x i32
def llvm_v16i32_ty : LLVMType<v16i32>; // 16 x i32
def llvm_v32i32_ty : LLVMType<v32i32>; // 32 x i32
def llvm_v64i32_ty : LLVMType<v64i32>; // 64 x i32
def llvm_v256i32_ty : LLVMType<v256i32>; // 256 x i32

def llvm_v1i64_ty : LLVMType<v1i64>; // 1 x i64
def llvm_v2i64_ty : LLVMType<v2i64>; // 2 x i64
@@ -5056,4 +5056,20 @@ let TargetPrefix = "x86" in {
            Intrinsic<[llvm_i8_ty], [], []>;
  def int_x86_senduipi : GCCBuiltin<"__builtin_ia32_senduipi">,
            Intrinsic<[], [llvm_i64_ty], []>;
  // AMX - internal intrinsics
  def int_x86_tileloadd64_internal :
            GCCBuiltin<"__builtin_ia32_tileloadd64_internal">,
            Intrinsic<[llvm_v256i32_ty],
                      [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
                      []>;
  def int_x86_tdpbssd_internal :
            GCCBuiltin<"__builtin_ia32_tdpbssd_internal">,
            Intrinsic<[llvm_v256i32_ty],
                      [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
                       llvm_v256i32_ty, llvm_v256i32_ty,
                       llvm_v256i32_ty], []>;
  def int_x86_tilestored64_internal :
            GCCBuiltin<"__builtin_ia32_tilestored64_internal">,
            Intrinsic<[], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty,
                           llvm_i64_ty, llvm_v256i32_ty], []>;
}
@@ -1558,4 +1558,6 @@ void HoistSpillHelper::LRE_DidCloneVirtReg(Register New, Register Old) {
    VRM.assignVirt2StackSlot(New, VRM.getStackSlot(Old));
  else
    llvm_unreachable("VReg should be assigned either physreg or stackslot");
  if (VRM.hasShape(Old))
    VRM.assignVirt2Shape(New, VRM.getShape(Old));
}
@@ -99,6 +99,16 @@ void LiveIntervalUnion::verify(LiveVirtRegBitSet& VisitedVRegs) {
}
#endif //!NDEBUG

LiveInterval *LiveIntervalUnion::getOneVReg() const {
  if (empty())
    return nullptr;
  for (LiveSegments::const_iterator SI = Segments.begin(); SI.valid(); ++SI) {
    // Return the first valid live interval.
    return SI.value();
  }
  return nullptr;
}

// Scan the vector of interfering virtual registers in this union. Assume it's
// quite small.
bool LiveIntervalUnion::Query::isSeenInterference(LiveInterval *VirtReg) const {
@@ -221,3 +221,13 @@ bool LiveRegMatrix::checkInterference(SlotIndex Start, SlotIndex End,
  }
  return false;
}

Register LiveRegMatrix::getOneVReg(unsigned PhysReg) const {
  LiveInterval *VRegInterval = nullptr;
  for (MCRegUnitIterator Unit(PhysReg, TRI); Unit.isValid(); ++Unit) {
    if ((VRegInterval = Matrix[*Unit].getOneVReg()))
      return VRegInterval->reg();
  }

  return MCRegister::NoRegister;
}
@@ -68,6 +68,7 @@ bool VirtRegMap::runOnMachineFunction(MachineFunction &mf) {
  Virt2PhysMap.clear();
  Virt2StackSlotMap.clear();
  Virt2SplitMap.clear();
  Virt2ShapeMap.clear();

  grow();
  return false;
@@ -32,6 +32,9 @@ set(sources
  X86CmovConversion.cpp
  X86DomainReassignment.cpp
  X86DiscriminateMemOps.cpp
  X86LowerAMXType.cpp
  X86TileConfig.cpp
  X86PreTileConfig.cpp
  X86ExpandPseudo.cpp
  X86FastISel.cpp
  X86FixupBWInsts.cpp
@@ -76,6 +76,10 @@ FunctionPass *createX86FlagsCopyLoweringPass();
/// Return a pass that expands WinAlloca pseudo-instructions.
FunctionPass *createX86WinAllocaExpander();

FunctionPass *createX86TileConfigPass();

FunctionPass *createX86PreTileConfigPass();

/// Return a pass that inserts int3 at the end of the function if it ends with a
/// CALL instruction. The pass does the same for each funclet as well. This
/// ensures that the open interval of function start and end PCs contains all

@@ -162,6 +166,9 @@ void initializeX86OptimizeLEAPassPass(PassRegistry &);
void initializeX86PartialReductionPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
void initializeX86PreTileConfigPass(PassRegistry &);
void initializeX86TileConfigPass(PassRegistry &);
void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);

namespace X86AS {
enum : unsigned {
@@ -461,6 +461,39 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
  case TargetOpcode::ICALL_BRANCH_FUNNEL:
    ExpandICallBranchFunnel(&MBB, MBBI);
    return true;
  case X86::PLDTILECFG: {
    MI.RemoveOperand(0);
    MI.setDesc(TII->get(X86::LDTILECFG));
    return true;
  }
  case X86::PSTTILECFG: {
    MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg
    MI.setDesc(TII->get(X86::STTILECFG));
    return true;
  }
  case X86::PTILELOADDV: {
    MI.RemoveOperand(8); // Remove $tmmcfg
    for (unsigned i = 2; i > 0; --i)
      MI.RemoveOperand(i);
    MI.setDesc(TII->get(X86::TILELOADD));
    return true;
  }
  case X86::PTDPBSSDV: {
    MI.RemoveOperand(7); // Remove $tmmcfg
    MI.untieRegOperand(4);
    for (unsigned i = 3; i > 0; --i)
      MI.RemoveOperand(i);
    MI.setDesc(TII->get(X86::TDPBSSD));
    MI.tieOperands(0, 1);
    return true;
  }
  case X86::PTILESTOREDV: {
    MI.RemoveOperand(8); // Remove $tmmcfg
    for (int i = 1; i >= 0; --i)
      MI.RemoveOperand(i);
    MI.setDesc(TII->get(X86::TILESTORED));
    return true;
  }
  }
  llvm_unreachable("Previous switch has a fallthrough?");
}
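A schematic view of the PTILELOADDV expansion above (component 7); the MIR is illustrative, not taken from the patch. The pseudo carries the shape registers and the $tmmcfg use only to model dependencies, so expansion drops them and keeps just the memory operands:

// Before expansion (operand indices per the pseudo definitions):
//   %t:tile = PTILELOADDV %row:gr16, %col:gr16, %base, 1, %stride, 0, $noreg, %cfg:tilecfg
// After RemoveOperand(8) and removing operands 2 and 1:
//   %t:tile = TILELOADD %base, 1, %stride, 0, $noreg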
@@ -2091,6 +2091,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
      emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true);
    }
  }

  // Emit tilerelease for AMX kernel.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  if (!MRI.reg_nodbg_empty(X86::TMMCFG))
    BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
}

StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
@@ -4572,6 +4572,49 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
      ReplaceNode(Node, Res);
      return;
    }
    case Intrinsic::x86_tileloadd64_internal: {
      if (!Subtarget->hasAMXTILE())
        break;
      unsigned Opc = X86::PTILELOADDV;
      // _tile_loadd_internal(row, col, buf, STRIDE)
      SDValue Base = Node->getOperand(4);
      SDValue Scale = getI8Imm(1, dl);
      SDValue Index = Node->getOperand(5);
      SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
      SDValue Segment = CurDAG->getRegister(0, MVT::i16);
      SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
      SDValue Chain = Node->getOperand(0);
      MachineSDNode *CNode;
      SDValue Ops[] = {Node->getOperand(2),
                       Node->getOperand(3),
                       Base,
                       Scale,
                       Index,
                       Disp,
                       Segment,
                       CFG,
                       Chain};
      CNode = CurDAG->getMachineNode(Opc, dl, {MVT::v256i32, MVT::Other}, Ops);
      ReplaceNode(Node, CNode);
      return;
    }
    case Intrinsic::x86_tdpbssd_internal: {
      if (!Subtarget->hasAMXTILE())
        break;
      unsigned Opc = X86::PTDPBSSDV;
      SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
      SDValue Ops[] = {Node->getOperand(2),
                       Node->getOperand(3),
                       Node->getOperand(4),
                       Node->getOperand(5),
                       Node->getOperand(6),
                       Node->getOperand(7),
                       CFG};
      MachineSDNode *CNode =
          CurDAG->getMachineNode(Opc, dl, {MVT::v256i32, MVT::Other}, Ops);
      ReplaceNode(Node, CNode);
      return;
    }
    }
    break;
  }

@@ -4629,6 +4672,31 @@ void X86DAGToDAGISel::Select(SDNode *Node) {

      break;
    }
    case Intrinsic::x86_tilestored64_internal: {
      unsigned Opc = X86::PTILESTOREDV;
      // _tile_stored_internal(row, col, buf, STRIDE, c)
      SDValue Base = Node->getOperand(4);
      SDValue Scale = getI8Imm(1, dl);
      SDValue Index = Node->getOperand(5);
      SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
      SDValue Segment = CurDAG->getRegister(0, MVT::i16);
      SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
      SDValue Chain = Node->getOperand(0);
      MachineSDNode *CNode;
      SDValue Ops[] = {Node->getOperand(2),
                       Node->getOperand(3),
                       Base,
                       Scale,
                       Index,
                       Disp,
                       Segment,
                       Node->getOperand(6),
                       CFG,
                       Chain};
      CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
      ReplaceNode(Node, CNode);
      return;
    }
    case Intrinsic::x86_tileloadd64:
    case Intrinsic::x86_tileloaddt164:
    case Intrinsic::x86_tilestored64: {
@@ -1897,6 +1897,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
  }

  if (Subtarget.hasAMXTILE()) {
    addRegisterClass(MVT::v256i32, &X86::TILERegClass);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

@@ -5341,6 +5345,12 @@ bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
  // width.
  if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
    return false;

  // Don't merge to x86 amx tile, as we only map MVT::v256i32
  // to x86 amx tile on amx intrinsics.
  if (MemVT == MVT::v256i32)
    return false;

  return true;
}
@@ -16,17 +16,21 @@

let Predicates = [HasAMXTILE, In64BitMode] in {
  let SchedRW = [WriteSystem] in {
    let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
    let hasSideEffects = 1,
        Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
    def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
                       "ldtilecfg\t$src",
                       [(int_x86_ldtilecfg addr:$src)]>, VEX, T8PS;
    let hasSideEffects = 1 in
    def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
                       "sttilecfg\t$src",
                       [(int_x86_sttilecfg addr:$src)]>, VEX, T8PD;
    let mayLoad = 1 in
    def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
                      (ins sibmem:$src),
                      "tileloadd\t{$src, $dst|$dst, $src}", []>,
                      VEX, T8XD;
    let mayLoad = 1 in
    def TILELOADDT1 : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
                        (ins sibmem:$src),
                        "tileloaddt1\t{$src, $dst|$dst, $src}", []>,

@@ -34,6 +38,7 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
    let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
    def TILERELEASE : I<0x49, MRM_C0, (outs), (ins),
                        "tilerelease", [(int_x86_tilerelease)]>, VEX, T8PS;
    let mayStore = 1 in
    def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs),
                       (ins sibmem:$dst, TILE:$src),
                       "tilestored\t{$src, $dst|$dst, $src}", []>,

@@ -42,6 +47,22 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
                      "tilezero\t$dst", []>,
                      VEX, T8XD;

    // Pseudo instructions for RA.
    let hasSideEffects = 1, mayLoad = 1,
        Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
    def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>;

    let hasSideEffects = 1, mayStore = 1 in
    def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>;

    def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
                                                      GR16:$src2,
                                                      opaquemem:$src3,
                                                      TILECFG:$cfg), []>;
    def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
                                            GR16:$src2, opaquemem:$src3,
                                            TILE:$src4, TILECFG:$cfg), []>;

    let usesCustomInserter = 1 in {
      // Pseudo instructions, using immediates instead of tile registers.
      // To be translated to the actual instructions in X86ISelLowering.cpp

@@ -76,6 +97,12 @@ let Predicates = [HasAMXINT8, In64BitMode] in {
                      VEX_4V, T8PS;
  }

  // Pseudo instruction for RA.
  let Constraints = "$src4 = $dst" in
  def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
                                                  GR16:$src2, GR16:$src3, TILE:$src4,
                                                  TILE:$src5, TILE:$src6, TILECFG:$cfg), []>;

  let usesCustomInserter = 1 in {
    // Pseudo instructions, using immediates instead of tile registers.
    // To be translated to the actual instructions in X86ISelLowering.cpp
@@ -3796,13 +3796,31 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
  const MachineFunction &MF = *MBB.getParent();
  assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
         "Stack slot too small for store");
  unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
  bool isAligned =
      (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
      RI.canRealignStack(MF);
  unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
      .addReg(SrcReg, getKillRegState(isKill));
  if (RC->getID() == X86::TILERegClassID) {
    unsigned Opc = X86::TILESTORED;
    // tilestored %tmm, (%sp, %idx)
    MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
    Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
    BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
    MachineInstr *NewMI =
        addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
            .addReg(SrcReg, getKillRegState(isKill));
    MachineOperand &MO = NewMI->getOperand(2);
    MO.setReg(VirtReg);
    MO.setIsKill(true);
  } else if (RC->getID() == X86::TILECFGRegClassID) {
    unsigned Opc = X86::PSTTILECFG;
    addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
        .addReg(SrcReg, getKillRegState(isKill));
  } else {
    unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
    bool isAligned =
        (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
        RI.canRealignStack(MF);
    unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
    addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
        .addReg(SrcReg, getKillRegState(isKill));
  }
}

void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,

@@ -3810,13 +3828,32 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                        Register DestReg, int FrameIdx,
                                        const TargetRegisterClass *RC,
                                        const TargetRegisterInfo *TRI) const {
  const MachineFunction &MF = *MBB.getParent();
  unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
  bool isAligned =
      (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
      RI.canRealignStack(MF);
  unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx);
  if (RC->getID() == X86::TILERegClassID) {
    unsigned Opc = X86::TILELOADD;
    // tileloadd (%sp, %idx), %tmm
    MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
    Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
    MachineInstr *NewMI =
        BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
    NewMI = addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
                              FrameIdx);
    MachineOperand &MO = NewMI->getOperand(3);
    MO.setReg(VirtReg);
    MO.setIsKill(true);
  } else if (RC->getID() == X86::TILECFGRegClassID) {
    unsigned Opc = X86::PLDTILECFG;
    addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
                      FrameIdx);
  } else {
    const MachineFunction &MF = *MBB.getParent();
    unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
    bool isAligned =
        (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
        RI.canRealignStack(MF);
    unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
    addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
                      FrameIdx);
  }
}

bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
@@ -0,0 +1,294 @@
//===- X86LowerAMXType.cpp - AMX type lowering ------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Pass to transform <256 x i32>.
/// <256 x i32> is mapped to an AMX tile register on X86, and the AMX
/// instruction set only provides simple operations on tile registers; basic
/// elementwise operations are not supported by AMX. Since we define the AMX
/// tile as vector <256 x i32> and only AMX intrinsics can operate on the
/// type, we need to transform load/store <256 x i32> instructions into AMX
/// load/store intrinsics. Besides, we split <256 x i32> into two <128 x i32>
/// if the vector is not used or defined by AMX intrinsics, so that in
/// instruction selection it can be lowered to a proper size which the HW can
/// support.
//
//===----------------------------------------------------------------------===//
//
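An illustrative before/after of the rewrite this pass performs (IR value names are hypothetical; the i64 64 stride matches the constant used in visitLD/visitST below):

// Before: a plain vector load feeds the accumulator operand (operand 3) of
// the dot-product intrinsic.
//   %t = load <256 x i32>, <256 x i32>* %p, align 64
//   %d = call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %m, i16 %n, i16 %k,
//            <256 x i32> %t, <256 x i32> %a, <256 x i32> %b)
// After: the load becomes a shaped tile load; the m x n shape is taken from
// the consuming intrinsic's arguments.
//   %p8 = bitcast <256 x i32>* %p to i8*
//   %t = call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %m, i16 %n,
//            i8* %p8, i64 64)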
#include "X86.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"

using namespace llvm;

#define DEBUG_TYPE "lower-amx-type"

namespace {
class X86LowerAMXType {
  Function &Func;
  const DataLayout &DL;
  DenseSet<Instruction *> LDSet;
  DenseSet<Instruction *> STSet;
  DenseMap<Value *, std::pair<LoadInst *, LoadInst *>> LoadMap;

public:
  X86LowerAMXType(Function &F) : Func(F), DL(F.getParent()->getDataLayout()) {}
  bool visit();
  bool visitLD();
  bool visitST();
  void splitST(Instruction *Inst);
  void splitLD(Instruction *Inst);
};

// Split a v256i32 load/store into two v128i32 operations, so that ISel can
// lower them to a proper vector size.
void X86LowerAMXType::splitST(Instruction *Inst) {
  StoreInst *ST = dyn_cast<StoreInst>(Inst);
  IRBuilder<> Builder(ST);
  LLVMContext &Ctx = Builder.getContext();
  Type *Ty = ST->getValueOperand()->getType();
  EVT VT = EVT::getEVT(Ty);
  EVT HalfVT = VT.getHalfNumVectorElementsVT(Ctx);
  Type *HalfTy = HalfVT.getTypeForEVT(Ctx);

  LoadInst *Lo, *Hi;
  std::tie(Lo, Hi) = LoadMap[ST->getValueOperand()];
  Value *Ptr = ST->getPointerOperand();
  PointerType *HalfPtrTy = HalfTy->getPointerTo(ST->getPointerAddressSpace());
  Value *HalfPtr = Builder.CreateBitCast(Ptr, HalfPtrTy);
  // The HW requires the alignment of an AMX tile to be 64, but the front end
  // generates code with the vector alignment, which is the vector size.
  uint64_t HalfTySize = HalfTy->getPrimitiveSizeInBits().getFixedSize() / 8;
  Align Alignment = std::min(Lo->getAlign(), Align(HalfTySize));
  Builder.CreateAlignedStore(Lo, HalfPtr, Alignment, ST->isVolatile());

  HalfPtr = Builder.CreateGEP(HalfTy, HalfPtr, Builder.getInt32(1));
  Builder.CreateAlignedStore(Hi, HalfPtr, Alignment, ST->isVolatile());
}

bool X86LowerAMXType::visitST() {
  if (STSet.empty())
    return false;
  for (auto *Inst : STSet) {
    Value *Row, *Col;
    const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst->getOperand(0));
    if (!II)
      Row = Col = nullptr;
    else {
      switch (II->getIntrinsicID()) {
      default:
        Row = Col = nullptr;
        break;
      case Intrinsic::x86_tileloadd64_internal:
      case Intrinsic::x86_tdpbssd_internal: {
        Row = II->getArgOperand(0);
        Col = II->getArgOperand(1);
        break;
      }
      }
    }
    if (!Row) {
      splitST(Inst);
      continue;
    }
    IRBuilder<> Builder(Inst);
    LLVMContext &Ctx = Builder.getContext();
    // Use the maximum column as stride. It must be the same as the load
    // stride.
    Value *Stride = Builder.getInt64(64);
    Value *I8Ptr =
        Builder.CreateBitCast(Inst->getOperand(1), Type::getInt8PtrTy(Ctx));
    std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride,
                                   Inst->getOperand(0)};

    Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
  }
  return true;
}

void X86LowerAMXType::splitLD(Instruction *Inst) {
  LoadInst *LD = dyn_cast<LoadInst>(Inst);
  IRBuilder<> Builder(LD);
  LLVMContext &Ctx = Builder.getContext();
  Type *Ty = LD->getType();
  EVT VT = EVT::getEVT(Ty);
  EVT HalfVT = VT.getHalfNumVectorElementsVT(Ctx);
  Type *HalfTy = HalfVT.getTypeForEVT(Ctx);

  Value *Ptr = LD->getPointerOperand();
  PointerType *HalfPtrTy = HalfTy->getPointerTo(LD->getPointerAddressSpace());
  Value *HalfPtr = Builder.CreateBitCast(Ptr, HalfPtrTy);
  // The HW requires the alignment of an AMX tile to be 64, but the front end
  // generates code with the vector alignment, which is the vector size.
  uint64_t HalfTySize = HalfTy->getPrimitiveSizeInBits().getFixedSize() / 8;
  Align Alignment = std::min(LD->getAlign(), Align(HalfTySize));
  auto *Lo =
      Builder.CreateAlignedLoad(HalfTy, HalfPtr, Alignment, LD->isVolatile());

  HalfPtr = Builder.CreateGEP(HalfTy, HalfPtr, Builder.getInt32(1));
  auto *Hi =
      Builder.CreateAlignedLoad(HalfTy, HalfPtr, Alignment, LD->isVolatile());

  LoadMap[Inst] = std::make_pair(Lo, Hi);
}

bool X86LowerAMXType::visitLD() {
  if (LDSet.empty())
    return false;
  for (auto &Inst : LDSet) {
    int Count = 0;
    Value *NewInst = nullptr;
    // The users should be all AMX intrinsics or all LLVM instructions;
    // mixed use by both AMX intrinsics and LLVM instructions is not
    // supported.
    for (auto I = Inst->use_begin(), E = Inst->use_end(); I != E;) {
      Use &U = *I++;
      const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U.getUser());
      if (!II) {
        Count++;
        continue;
      }
      if (NewInst)
        continue;
      Value *Row, *Col;
      switch (II->getIntrinsicID()) {
      default:
        report_fatal_error("Non-AMX intrinsic use tile type.");
        break;
      case Intrinsic::x86_tdpbssd_internal: {
        unsigned OpNo = U.getOperandNo();
        switch (OpNo) {
        case 3:
          Row = II->getArgOperand(0);
          Col = II->getArgOperand(1);
          break;
        case 4:
          Row = II->getArgOperand(0);
          Col = II->getArgOperand(2);
          break;
        case 5:
          Row = II->getArgOperand(2);
          Col = II->getArgOperand(1);
          break;
        }
        break;
      }
      case Intrinsic::x86_tilestored64_internal: {
        Row = II->getArgOperand(0);
        Col = II->getArgOperand(1);
        break;
      }
      }
      assert(Count == 0 && "Can NOT mix amx intrinsic and LLVM instruction");
      // FIXME: The shape def should be ahead of the load.
      IRBuilder<> Builder(Inst);
      LLVMContext &Ctx = Builder.getContext();
      // Use the maximum column as stride.
      Value *Stride = Builder.getInt64(64);
      Value *I8Ptr =
          Builder.CreateBitCast(Inst->getOperand(0), Type::getInt8PtrTy(Ctx));
      std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};

      NewInst = Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal,
                                        None, Args);

      Inst->replaceAllUsesWith(NewInst);
    }
    if (!NewInst)
      splitLD(Inst);
  }
  return true;
}

bool X86LowerAMXType::visit() {
  bool C;
  auto IsAMXType = [](FixedVectorType *VTy) {
    if (!VTy)
      return false;
    if (!VTy->getScalarType()->isIntegerTy(32))
      return false;
    if (VTy->getNumElements() != 256)
      return false;

    return true;
  };

  for (BasicBlock &BB : Func) {
    for (Instruction &Inst : BB) {
      LoadInst *LD = dyn_cast<LoadInst>(&Inst);
      // Check load instruction.
      // %3 = load <256 x i32>, <256 x i32>* %1, align 64
      if (LD) {
        FixedVectorType *VTy = dyn_cast<FixedVectorType>(Inst.getType());
        if (!IsAMXType(VTy))
          continue;
        LDSet.insert(&Inst);
        continue;
      }
      // Check store instruction.
      // store <256 x i32> %3, <256 x i32>* %2, align 64
      StoreInst *ST = dyn_cast<StoreInst>(&Inst);
      if (!ST)
        continue;
      FixedVectorType *VTy =
          dyn_cast<FixedVectorType>(ST->getOperand(0)->getType());
      if (!IsAMXType(VTy))
        continue;
      STSet.insert(&Inst);
    }
  }

  C = visitLD() | visitST();
  for (auto *Inst : STSet)
    Inst->eraseFromParent();
  for (auto *Inst : LDSet)
    Inst->eraseFromParent();
  return C;
}
} // anonymous namespace

namespace {

class X86LowerAMXTypeLegacyPass : public FunctionPass {
public:
  static char ID;

  X86LowerAMXTypeLegacyPass() : FunctionPass(ID) {
    initializeX86LowerAMXTypeLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    X86LowerAMXType LAT(F);
    bool C = LAT.visit();
    return C;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
  }
};

} // anonymous namespace

static const char PassName[] = "Lower AMX type for load/store";
char X86LowerAMXTypeLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
                      false)
INITIALIZE_PASS_END(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
                    false)

FunctionPass *llvm::createX86LowerAMXTypePass() {
  return new X86LowerAMXTypeLegacyPass();
}
@@ -0,0 +1,263 @@
//===-- X86PreTileConfig.cpp - Tile Register Configure --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Pass to pre-config the shape of AMX registers.
/// AMX registers need to be configured before use. The shape of an AMX
/// register is encoded in the 1st and 2nd machine operands of the AMX pseudo
/// instructions. The pldtilecfg configures the tile registers and should
/// dominate all AMX instructions. It produces a virtual cfg register, and
/// that cfg register is used by all AMX instructions.
/// This pass finds the common dominator of all AMX instructions and inserts
/// the pldtilecfg instruction there. Besides, the cfg register that
/// pldtilecfg produces is inserted as the last operand of each AMX
/// instruction. We use this scheme to model the def-use relationship between
/// the AMX config instruction and the other AMX instructions. Below is an
/// example.
///
///                        ----B1----
///                       /          \
///                      /            \
///                     B2             B3
///     %1:tile = PTILELOADDV       %2:tile = PTILELOADDV
///
/// is transformed to
///
///                            B1
///                 %25:tilecfg = PLDTILECFG
///                       /          \
///                      /            \
///   %1:tile = PTILELOADDV %25    %2:tile = PTILELOADDV %25
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TileShapeInfo.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "tile-pre-config"

namespace {

class X86PreTileConfig : public MachineFunctionPass {
  // Context.
  MachineFunction *MF = nullptr;
  const X86Subtarget *ST = nullptr;
  const TargetRegisterInfo *TRI;
  const TargetInstrInfo *TII;
  MachineDominatorTree *DomTree = nullptr;
  MachineRegisterInfo *MRI = nullptr;

  MachineInstr *getTileConfigPoint();

public:
  X86PreTileConfig() : MachineFunctionPass(ID) {}

  /// Return the pass name.
  StringRef getPassName() const override {
    return "Tile Register Pre-configure";
  }

  /// X86PreTileConfig analysis usage.
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Perform register allocation.
  bool runOnMachineFunction(MachineFunction &mf) override;

  static char ID;
};

} // end anonymous namespace

char X86PreTileConfig::ID = 0;

INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig",
                      "Tile Register Configure", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
                    "Tile Register Configure", false, false)

void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesAll();
  AU.addRequired<MachineDominatorTree>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
                              const TargetInstrInfo *TII,
                              MachineRegisterInfo *MRI,
                              const X86Subtarget *ST) {
  auto *MBB = MI->getParent();

  // FIXME: AMX should assume AVX512 enabled.
  if (ST->hasAVX512()) {
    // Zero stack slot.
    Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
    BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm)
        .addReg(Zmm, RegState::Undef)
        .addReg(Zmm, RegState::Undef);
    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)),
                      FrameIdx)
        .addReg(Zmm);
  }

  // Build pseudo ldtilecfg.
  Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass);

  addFrameReference(
      BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx);

  return VReg;
}

static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  default:
    llvm_unreachable("Unexpected machine instruction on tile");
  case X86::PTILELOADDV:
  case X86::PTDPBSSDV:
    MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1));
    MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2));
    ShapeT Shape(&MO1, &MO2, MRI);
    return Shape;
  }
}

MachineInstr *X86PreTileConfig::getTileConfigPoint() {
  DenseMap<Register, ShapeT> PhysShapeInfo;
  MachineBasicBlock *MBB = nullptr;
  DenseSet<const MachineInstr *> MIs;
  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
    Register VirtReg = Register::index2VirtReg(i);
    if (MRI->reg_nodbg_empty(VirtReg))
      continue;
    const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
    if (RC.getID() != X86::TILERegClassID)
      continue;

    // Find the common dominator for all MIs that define a tile register.
    for (const MachineOperand &MO : MRI->def_operands(VirtReg)) {
      if (MO.isUndef())
        continue;
      const auto *MI = MO.getParent();
      // PHI or IMPLICIT_DEF instruction.
      // There must be an input tile before a PHI instruction.
      if (MI->isTransient())
        continue;
      if (!MBB)
        MBB = const_cast<MachineBasicBlock *>(MI->getParent());
      MBB = DomTree->findNearestCommonDominator(
          MBB, const_cast<MachineBasicBlock *>(MI->getParent()));

      // Collect the instructions that define the shape.
      ShapeT Shape = getShape(*MI, MRI);
      std::array<MachineOperand *, 2> ShapeMOs = {Shape.getRow(),
                                                  Shape.getCol()};
      for (auto *ShapeMO : ShapeMOs) {
        Register ShapeReg = ShapeMO->getReg();
        for (const MachineOperand &MO : MRI->def_operands(ShapeReg)) {
          const auto *ShapeMI = MO.getParent();
          MIs.insert(ShapeMI);
        }
      }
    }
  }
  if (!MBB)
    return nullptr;
  // This pass runs before the PHI-elimination pass, so the function
  // is in SSA form.
  assert(MRI->isSSA() && "Not SSA form in pre-tile config");
  // Shape defs should dominate the tile config MBB.
  //    def s           s1    s2
  //     / \             \    /
  //    /   \             \  /
  //  conf               s3=phi(s1,s2)
  //                       |
  //                       c
  //
  for (const auto *MI : MIs) {
    const MachineBasicBlock *ShapeMBB = MI->getParent();
    if (DomTree->dominates(ShapeMBB, MBB))
      continue;
    if (MI->isMoveImmediate())
      continue;
    report_fatal_error(MF->getName() + ": Failed to config tile register, "
                                       "please define the shape earlier");
  }

  // ldtilecfg should be inserted after the MI that defines the shape.
  MachineBasicBlock::reverse_instr_iterator I, E;
  for (I = MBB->instr_rbegin(), E = MBB->instr_rend(); I != E; ++I) {
    auto *MI = &*I;
    if (MIs.count(MI) && (!MI->isMoveImmediate()))
      break;
  }
  MachineBasicBlock::iterator MII;
  if (I == E)
    MII = MBB->getFirstNonPHI();
  else {
    MII = MachineBasicBlock::iterator(&*I);
    MII++;
  }
  return &*MII;
}

static void addTileCFGUse(MachineFunction &MF, Register CFG) {
  for (MachineBasicBlock &MBB : MF) {

    // Traverse the basic block.
    for (MachineInstr &MI : MBB) {
      unsigned Opcode = MI.getOpcode();
      switch (Opcode) {
      default:
        break;
      case X86::PTILELOADDV:
      case X86::PTILESTOREDV:
      case X86::PTDPBSSDV:
        unsigned NumOperands = MI.getNumOperands();
        MI.RemoveOperand(NumOperands - 1);
        MI.addOperand(MF, MachineOperand::CreateReg(CFG, false));
        break;
      }
    }
  }
}

bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
  MF = &mf;
  MRI = &mf.getRegInfo();
  ST = &mf.getSubtarget<X86Subtarget>();
  TRI = ST->getRegisterInfo();
  TII = mf.getSubtarget().getInstrInfo();
  DomTree = &getAnalysis<MachineDominatorTree>();

  MachineInstr *MI = getTileConfigPoint();
  if (!MI)
    return false;
  unsigned Size = ST->getTileConfigSize();
  Align Alignment = ST->getTileConfigAlignment();
  int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
  Register CFG = buildConfigMI(MI, SS, TII, MRI, ST);
  addTileCFGUse(mf, CFG);
  return true;
}

FunctionPass *llvm::createX86PreTileConfigPass() {
  return new X86PreTileConfig();
}
@ -19,6 +19,7 @@
|
|||
#include "llvm/ADT/BitVector.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/ADT/SmallSet.h"
|
||||
#include "llvm/CodeGen/LiveRegMatrix.h"
|
||||
#include "llvm/CodeGen/MachineFrameInfo.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
|
@ -856,3 +857,78 @@ X86RegisterInfo::getPtrSizedStackRegister(const MachineFunction &MF) const {
    StackReg = getX86SubSuperRegister(StackReg, 32);
  return StackReg;
}

static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
                           const MachineRegisterInfo *MRI) {
  if (VRM->hasShape(VirtReg))
    return VRM->getShape(VirtReg);

  const MachineOperand &Def = *MRI->def_begin(VirtReg);
  MachineInstr *MI = const_cast<MachineInstr *>(Def.getParent());
  unsigned OpCode = MI->getOpcode();
  switch (OpCode) {
  default:
    llvm_unreachable("Unexpected machine instruction on tile register!");
    break;
  // We only collect the tile shape that is defined.
  case X86::PTILELOADDV:
  case X86::PTDPBSSDV:
    MachineOperand &MO1 = MI->getOperand(1);
    MachineOperand &MO2 = MI->getOperand(2);
    ShapeT Shape(&MO1, &MO2, MRI);
    VRM->assignVirt2Shape(VirtReg, Shape);
    return Shape;
  }
}

bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
                                            ArrayRef<MCPhysReg> Order,
                                            SmallVectorImpl<MCPhysReg> &Hints,
                                            const MachineFunction &MF,
                                            const VirtRegMap *VRM,
                                            const LiveRegMatrix *Matrix) const {
  const MachineRegisterInfo *MRI = &MF.getRegInfo();
  const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
  bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
      VirtReg, Order, Hints, MF, VRM, Matrix);

  if (RC.getID() != X86::TILERegClassID)
    return BaseImplRetVal;

  ShapeT VirtShape = getTileShape(VirtReg, const_cast<VirtRegMap *>(VRM), MRI);
  auto AddHint = [&](MCPhysReg PhysReg) {
    Register VReg = Matrix->getOneVReg(PhysReg);
    if (VReg == MCRegister::NoRegister) { // Not allocated yet
      Hints.push_back(PhysReg);
      return;
    }
    ShapeT PhysShape = getTileShape(VReg, const_cast<VirtRegMap *>(VRM), MRI);
    if (PhysShape == VirtShape)
      Hints.push_back(PhysReg);
  };

  SmallSet<MCPhysReg, 4> CopyHints;
  CopyHints.insert(Hints.begin(), Hints.end());
  Hints.clear();
  for (auto Hint : CopyHints) {
    if (RC.contains(Hint) && !MRI->isReserved(Hint))
      AddHint(Hint);
  }
  for (MCPhysReg PhysReg : Order) {
    if (!CopyHints.count(PhysReg) && RC.contains(PhysReg) &&
        !MRI->isReserved(PhysReg))
      AddHint(PhysReg);
  }

#define DEBUG_TYPE "tile-hint"
  LLVM_DEBUG({
    dbgs() << "Hints for virtual register " << format_hex(VirtReg, 8) << "\n";
    for (auto Hint : Hints) {
      dbgs() << "tmm" << Hint << ",";
    }
    dbgs() << "\n";
  });
#undef DEBUG_TYPE

  return true;
}
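Editorial note on the hunk above: the hint loop keeps a candidate tile register only when it is still free, or when the virtual register already assigned to it has the same shape, so each physical tile only ever needs one configuration. A minimal standalone C++ sketch of that filtering policy (Shape, PhysReg and the Assigned map here are simplified stand-ins for ShapeT, MCPhysReg and the LiveRegMatrix query, not LLVM API):

  #include <cstdint>
  #include <map>
  #include <vector>

  struct Shape {
    uint16_t Rows, Cols;
    bool operator==(const Shape &O) const {
      return Rows == O.Rows && Cols == O.Cols;
    }
  };

  using PhysReg = unsigned;

  // Keep a candidate register if it is unassigned, or if the value already
  // living in it was configured with exactly the shape we want.
  std::vector<PhysReg> filterHints(const std::vector<PhysReg> &Order,
                                   Shape Wanted,
                                   const std::map<PhysReg, Shape> &Assigned) {
    std::vector<PhysReg> Hints;
    for (PhysReg R : Order) {
      auto It = Assigned.find(R);
      if (It == Assigned.end() || It->second == Wanted)
        Hints.push_back(R);
    }
    return Hints;
  }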
@ -144,6 +144,11 @@ public:
  Register getFramePtr() const { return FramePtr; }
  // FIXME: Move to FrameInfo
  unsigned getSlotSize() const { return SlotSize; }

  bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
                             SmallVectorImpl<MCPhysReg> &Hints,
                             const MachineFunction &MF, const VirtRegMap *VRM,
                             const LiveRegMatrix *Matrix) const override;
};

} // End llvm namespace
@ -265,6 +265,9 @@ let SubRegIndices = [sub_ymm] in {
  }
}

// Tile config registers.
def TMMCFG: X86Reg<"tmmcfg", 0>;

// Tile "registers".
def TMM0: X86Reg<"tmm0", 0>;
def TMM1: X86Reg<"tmm1", 1>;

@ -633,6 +636,11 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;

// Tiles
let isAllocatable = 0 in
def TILE : RegisterClass<"X86", [untyped], 0,
let CopyCost = -1 in // Don't allow copying of tile registers
def TILE : RegisterClass<"X86", [v256i32], 8192,
                         (sequence "TMM%u", 0, 7)> {let Size = 8192;}
def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> {
  let CopyCost = -1; // Don't allow copying of tile config registers.
  let isAllocatable = 1;
  let Size = 512;
}
@ -472,6 +472,8 @@ class X86Subtarget final : public X86GenSubtargetInfo {
  /// entry to the function and which must be maintained by every function.
  Align stackAlignment = Align(4);

  Align TileConfigAlignment = Align(4);

  /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
  ///
  // FIXME: this is a known good value for Yonah. How about others?

@ -555,6 +557,9 @@ public:
    return &getInstrInfo()->getRegisterInfo();
  }

  unsigned getTileConfigSize() const { return 64; }
  Align getTileConfigAlignment() const { return TileConfigAlignment; }

  /// Returns the minimum alignment known to hold of the
  /// stack frame on entry to the function and which must be maintained by every
  /// function for this subtarget.
@ -62,6 +62,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
  RegisterTargetMachine<X86TargetMachine> Y(getTheX86_64Target());

  PassRegistry &PR = *PassRegistry::getPassRegistry();
  initializeX86LowerAMXTypeLegacyPassPass(PR);
  initializeGlobalISel(PR);
  initializeWinEHStatePassPass(PR);
  initializeFixupBWInstPassPass(PR);

@ -71,6 +72,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
  initializeX86FixupSetCCPassPass(PR);
  initializeX86CallFrameOptimizationPass(PR);
  initializeX86CmovConverterPassPass(PR);
  initializeX86TileConfigPass(PR);
  initializeX86ExpandPseudoPass(PR);
  initializeX86ExecutionDomainFixPass(PR);
  initializeX86DomainReassignmentPass(PR);

@ -379,6 +381,7 @@ public:
  void addPreEmitPass() override;
  void addPreEmitPass2() override;
  void addPreSched2() override;
  bool addPreRewrite() override;

  std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};

@ -407,6 +410,7 @@ TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {

void X86PassConfig::addIRPasses() {
  addPass(createAtomicExpandPass());
  addPass(createX86LowerAMXTypePass());

  TargetPassConfig::addIRPasses();

@ -492,7 +496,12 @@ void X86PassConfig::addPreRegAlloc() {
    addPass(createX86SpeculativeLoadHardeningPass());
  addPass(createX86FlagsCopyLoweringPass());
  addPass(createX86WinAllocaExpander());

  if (getOptLevel() != CodeGenOpt::None) {
    addPass(createX86PreTileConfigPass());
  }
}

void X86PassConfig::addMachineSSAOptimization() {
  addPass(createX86DomainReassignmentPass());
  TargetPassConfig::addMachineSSAOptimization();

@ -565,6 +574,11 @@ void X86PassConfig::addPreEmitPass2() {
  addPass(createX86LoadValueInjectionRetHardeningPass());
}

bool X86PassConfig::addPreRewrite() {
  addPass(createX86TileConfigPass());
  return true;
}

std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}
@ -0,0 +1,248 @@
//===-- X86TileConfig.cpp - Tile Register Configure----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Pass to configure the shape of AMX physical registers
/// AMX registers need to be configured before use. In the X86PreTileConfig
/// pass the pldtilecfg instruction is inserted, but at that point we do not
/// yet know the shape of each physical tile register, because register
/// allocation has not been done. This pass runs after the register
/// allocation pass. It collects the shape information of each physical tile
/// register and stores the shape in the stack slot that is allocated for
/// loading the config into the tile config register.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TileShapeInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "tile-config"

namespace {

class X86TileConfig : public MachineFunctionPass {
  // context
  MachineFunction *MF = nullptr;
  const X86Subtarget *ST = nullptr;
  const TargetRegisterInfo *TRI;
  const TargetInstrInfo *TII;
  MachineDominatorTree *DomTree = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  VirtRegMap *VRM = nullptr;
  LiveIntervals *LIS = nullptr;

  MachineInstr *getTileConfigPoint();
  void tileConfig();

public:
  X86TileConfig() : MachineFunctionPass(ID) {}

  /// Return the pass name.
  StringRef getPassName() const override { return "Tile Register Configure"; }

  /// X86TileConfig analysis usage.
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Perform register allocation.
  bool runOnMachineFunction(MachineFunction &mf) override;

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoPHIs);
  }

  static char ID;
};

} // end anonymous namespace

char X86TileConfig::ID = 0;

INITIALIZE_PASS_BEGIN(X86TileConfig, "tileconfig", "Tile Register Configure",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_END(X86TileConfig, "tileconfig", "Tile Register Configure",
                    false, false)

void X86TileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<MachineDominatorTree>();
  AU.addRequired<LiveIntervals>();
  AU.addPreserved<SlotIndexes>();
  AU.addRequired<VirtRegMap>();
  AU.setPreservesAll();
  MachineFunctionPass::getAnalysisUsage(AU);
}

static unsigned getTilePhysRegIndex(Register PhysReg) {
  assert((PhysReg >= X86::TMM0 && PhysReg <= X86::TMM7) &&
         "Tile register number is invalid");
  return (PhysReg - X86::TMM0);
}

static MachineInstr *
storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                    Register SrcReg, unsigned BitSize, int FrameIdx, int Offset,
                    const TargetInstrInfo *TII, const TargetRegisterClass *RC,
                    const TargetRegisterInfo *TRI) {

  unsigned SubIdx = (BitSize == 8) ? X86::sub_8bit : X86::sub_16bit;
  unsigned Opc = (BitSize == 8) ? X86::MOV8mr : X86::MOV16mr;
  if (BitSize == TRI->getRegSizeInBits(*RC))
    SubIdx = 0;
  MachineInstr *NewMI =
      addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)), FrameIdx,
                        Offset)
          .addReg(SrcReg, 0, SubIdx);
  return NewMI;
}

static MachineInstr *storeImmToStackSlot(MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MI,
                                         int64_t Imm, unsigned BitSize,
                                         int FrameIdx, int Offset,
                                         const TargetInstrInfo *TII) {
  unsigned Opc = (BitSize == 8) ? X86::MOV8mi : X86::MOV16mi;
  return addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)),
                           FrameIdx, Offset)
      .addImm(Imm);
}

MachineInstr *X86TileConfig::getTileConfigPoint() {
  for (MachineBasicBlock &MBB : *MF) {

    // Traverse the basic block.
    for (MachineInstr &MI : MBB)
      // Refer to X86PreTileConfig.cpp.
      // We only support one tile config for now.
      if (MI.getOpcode() == X86::PLDTILECFG)
        return &MI;
  }

  return nullptr;
}

void X86TileConfig::tileConfig() {
  MachineInstr *MI = getTileConfigPoint();
  if (!MI)
    return;
  MachineBasicBlock *MBB = MI->getParent();
  int SS = MI->getOperand(1).getIndex();
  BitVector PhysRegs(TRI->getNumRegs());

  // Fill in the palette first.
  auto *NewMI = storeImmToStackSlot(*MBB, *MI, 1, 8, SS, 0, TII);
  LIS->InsertMachineInstrInMaps(*NewMI);
  // Fill in the shape of each tile physical register.
  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
    Register VirtReg = Register::index2VirtReg(i);
    if (MRI->reg_nodbg_empty(VirtReg))
      continue;
    const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
    if (RC.getID() != X86::TILERegClassID)
      continue;
    Register PhysReg = VRM->getPhys(VirtReg);
    if (PhysRegs.test(PhysReg))
      continue;
    PhysRegs.set(PhysReg);
    ShapeT Shape = VRM->getShape(VirtReg);
    Register RowReg = Shape.getRow()->getReg();
    Register ColReg = Shape.getCol()->getReg();

    // Here is the data format for the tile config.
    // 0      palette
    // 1      start_row
    // 2-15   reserved, must be zero
    // 16-17  tile0.colsb Tile 0 bytes per row.
    // 18-19  tile1.colsb Tile 1 bytes per row.
    // 20-21  tile2.colsb Tile 2 bytes per row.
    // ... (sequence continues)
    // 30-31  tile7.colsb Tile 7 bytes per row.
    // 32-47  reserved, must be zero
    // 48     tile0.rows Tile 0 rows.
    // 49     tile1.rows Tile 1 rows.
    // 50     tile2.rows Tile 2 rows.
    // ... (sequence continues)
    // 55     tile7.rows Tile 7 rows.
    // 56-63  reserved, must be zero
    unsigned Index = getTilePhysRegIndex(PhysReg);
    int RowOffset = 48 + Index;
    int ColOffset = 16 + Index * 2;

    unsigned BitSize = 8;
    for (const auto &Pair : {std::make_pair(RowReg, RowOffset),
                             std::make_pair(ColReg, ColOffset)}) {
      int64_t Imm;
      int ImmCount = 0;
      // All defs must be the same value; otherwise the MIs are invalid.
      // An immediate is preferred.
      for (const MachineOperand &MO : MRI->def_operands(Pair.first)) {
        const auto *Inst = MO.getParent();
        if (Inst->isMoveImmediate()) {
          ImmCount++;
          Imm = Inst->getOperand(1).getImm();
          break;
        }
      }
      auto StoreConfig = [&](int Offset) {
        MachineInstr *NewMI = nullptr;
        if (ImmCount)
          NewMI = storeImmToStackSlot(*MBB, *MI, Imm, BitSize, SS, Offset, TII);
        else {
          const TargetRegisterClass *RC = MRI->getRegClass(Pair.first);
          NewMI = storeRegToStackSlot(*MBB, *MI, Pair.first, BitSize, SS,
                                      Offset, TII, RC, TRI);
        }
        SlotIndex SIdx = LIS->InsertMachineInstrInMaps(*NewMI);
        if (!ImmCount) {
          // Extend the live interval.
          SmallVector<SlotIndex, 8> EndPoints = {SIdx.getRegSlot()};
          LiveInterval &Int = LIS->getInterval(Pair.first);
          LIS->extendToIndices(Int, EndPoints);
        }
      };
      StoreConfig(Pair.second);
      BitSize += 8;
    }
  }
}

bool X86TileConfig::runOnMachineFunction(MachineFunction &mf) {
  MF = &mf;
  MRI = &mf.getRegInfo();
  ST = &mf.getSubtarget<X86Subtarget>();
  TRI = ST->getRegisterInfo();
  TII = mf.getSubtarget().getInstrInfo();
  DomTree = &getAnalysis<MachineDominatorTree>();
  VRM = &getAnalysis<VirtRegMap>();
  LIS = &getAnalysis<LiveIntervals>();

  if (VRM->isShapeMapEmpty())
    return false;

  tileConfig();
  return true;
}

FunctionPass *llvm::createX86TileConfigPass() { return new X86TileConfig(); }
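For reference, the 64-byte block that tileConfig() fills and ldtilecfg consumes can be modeled with a plain C++ struct. This is an illustrative sketch only (the type and field names are ours), but the byte offsets mirror the data-format comment in tileConfig(): palette at byte 0, colsb at 16 + 2 * index, rows at 48 + index.

  #include <cstdint>

  // Illustrative model of the tile configuration memory layout.
  struct TileConfig {
    uint8_t Palette;       // byte 0; the pass stores palette 1 here first
    uint8_t StartRow;      // byte 1
    uint8_t Reserved0[14]; // bytes 2-15, must be zero
    uint16_t ColsB[8];     // bytes 16-31, bytes per row of tmm0..tmm7
    uint8_t Reserved1[16]; // bytes 32-47, must be zero
    uint8_t Rows[8];       // bytes 48-55, rows of tmm0..tmm7
    uint8_t Reserved2[8];  // bytes 56-63, must be zero
  };
  static_assert(sizeof(TileConfig) == 64, "ldtilecfg expects 64 bytes");

  // Set the shape of tile `Index` using the same offsets the pass computes:
  // RowOffset = 48 + Index, ColOffset = 16 + Index * 2.
  void setTileShape(TileConfig &Cfg, unsigned Index, uint8_t Rows,
                    uint16_t ColsB) {
    Cfg.Rows[Index] = Rows;
    Cfg.ColsB[Index] = ColsB;
  }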
@ -0,0 +1,91 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%struct.__tile_str = type <{ i16, i16, [60 x i8], <256 x i32> }>

@buf = dso_local global [3072 x i8] zeroinitializer, align 16

define dso_local void @test_api(i16 signext %0, i16 signext %1) local_unnamed_addr #2 {
; CHECK-LABEL: test_api:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: subq $4056, %rsp # imm = 0xFD8
; CHECK-NEXT: .cfi_def_cfa_offset 4096
; CHECK-NEXT: .cfi_offset %rbx, -40
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: movl %edi, %ebp
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
; CHECK-NEXT: movl $buf, %eax
; CHECK-NEXT: movl $32, %r14d
; CHECK-NEXT: movw $8, %r15w
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm1
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm1, 2048(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: movl $buf+1024, %eax
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm2, 1024(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: movl $buf+2048, %eax
; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0
; CHECK-NEXT: movabsq $64, %rcx
; CHECK-NEXT: tileloadd 2048(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload
; CHECK-NEXT: movabsq $64, %rcx
; CHECK-NEXT: tileloadd 1024(%rsp,%rcx), %tmm2 # 1024-byte Folded Reload
; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
; CHECK-NEXT: tilestored %tmm0, (%rax,%r14)
; CHECK-NEXT: addq $4056, %rsp # imm = 0xFD8
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: popq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: popq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: tilerelease
; CHECK-NEXT: retq
  %3 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32) #4
  %4 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32) #4
  tail call void (...) @foo() #4
  %5 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32) #4
  %6 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, <256 x i32> %5, <256 x i32> %3, <256 x i32> %4) #4
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, <256 x i32> %6) #4
  ret void
}

declare dso_local void @foo(...) local_unnamed_addr #3

declare <256 x i32> @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #4
declare <256 x i32> @llvm.x86.tdpbssd.internal(i16, i16, i16, <256 x i32>, <256 x i32>, <256 x i32>) #4
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, <256 x i32>) #4

attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="8192" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { nounwind }
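For readers following the tdpbssd lines in the checks above: the instruction is a signed-byte dot-product accumulate over 4-byte groups. A scalar C++ model of the intended semantics (our own hedged sketch assuming row-major tiles; the ISA reference is authoritative):

  #include <cstdint>

  // C is M x N signed dwords, A is M rows of 4*K signed bytes, B is K rows
  // of 4*N signed bytes; each C[m][n] accumulates four byte products per k.
  void tdpbssdModel(int32_t *C, const int8_t *A, const int8_t *B, int M,
                    int N, int K) {
    for (int m = 0; m < M; ++m)
      for (int k = 0; k < K; ++k)
        for (int n = 0; n < N; ++n) {
          int32_t Acc = 0;
          for (int i = 0; i < 4; ++i)
            Acc += int32_t(A[m * 4 * K + 4 * k + i]) *
                   int32_t(B[k * 4 * N + 4 * n + i]);
          C[m * N + n] += Acc;
        }
  }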
@ -0,0 +1,77 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -verify-machineinstrs | FileCheck %s

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16

; Function Attrs: nounwind uwtable
define dso_local void @test_api(i32 %0, i16 signext %1, i16 signext %2) local_unnamed_addr #2 {
; CHECK-LABEL: test_api:
; CHECK: # %bb.0:
; CHECK-NEXT: movsbl %sil, %eax
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: je .LBB0_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: movl $buf, %ecx
; CHECK-NEXT: jmp .LBB0_3
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: movl $buf2, %ecx
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: movl $32, %edi
; CHECK-NEXT: tileloadd (%rcx,%rdi), %tmm0
; CHECK-NEXT: tileloadd (%rcx,%rdi), %tmm2
; CHECK-NEXT: tileloadd (%rcx,%rdi), %tmm1
; CHECK-NEXT: tdpbssd %tmm2, %tmm0, %tmm1
; CHECK-NEXT: movl $buf, %ecx
; CHECK-NEXT: movl $32, %esi
; CHECK-NEXT: tilestored %tmm1, (%rcx,%rsi)
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %4 = icmp eq i32 %0, 0
  %5 = shl i16 %1, 8
  %6 = ashr exact i16 %5, 8
  br i1 %4, label %11, label %7

7: ; preds = %3
  %8 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %6, i16 %1, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
  %9 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
  %10 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
  br label %15

11: ; preds = %3
  %12 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %6, i16 %1, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) #3
  %13 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) #3
  %14 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) #3
  br label %15

15: ; preds = %11, %7
  %16 = phi <256 x i32> [ %12, %11 ], [ %8, %7 ]
  %17 = phi <256 x i32> [ %13, %11 ], [ %9, %7 ]
  %18 = phi <256 x i32> [ %14, %11 ], [ %10, %7 ]
  %19 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %6, i16 %2, i16 %1, <256 x i32> %18, <256 x i32> %16, <256 x i32> %17) #3
  tail call void @llvm.x86.tilestored64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, <256 x i32> %19) #3
  ret void
}

declare <256 x i32> @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #3

declare <256 x i32> @llvm.x86.tdpbssd.internal(i16, i16, i16, <256 x i32>, <256 x i32>, <256 x i32>) #3

declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, <256 x i32>) #3

attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="8192" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+avx,+avx2,+avx512f,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind }
@ -0,0 +1,112 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16

define dso_local void @test_api(i32 %0, i16 signext %1, i16 signext %2) local_unnamed_addr #2 {
; CHECK-LABEL: test_api:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $2936, %rsp # imm = 0xB78
; CHECK-NEXT: .cfi_def_cfa_offset 2944
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %si, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $buf, %r8d
; CHECK-NEXT: movl $32, %eax
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm1
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm1
; CHECK-NEXT: movabsq $64, %rcx
; CHECK-NEXT: tilestored %tmm1, 896(%rsp,%rcx) # 1024-byte Folded Spill
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm3
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm4
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm2
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm5
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm0
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: je .LBB0_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm6
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm7
; CHECK-NEXT: tileloadd (%r8,%rax), %tmm1
; CHECK-NEXT: jmp .LBB0_3
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: movl $buf2, %ecx
; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm6
; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm7
; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm1
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: tdpbssd %tmm7, %tmm6, %tmm1
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tileloadd 896(%rsp,%rax), %tmm7 # 1024-byte Folded Reload
; CHECK-NEXT: tdpbssd %tmm7, %tmm1, %tmm3
; CHECK-NEXT: tdpbssd %tmm4, %tmm3, %tmm2
; CHECK-NEXT: tdpbssd %tmm5, %tmm2, %tmm0
; CHECK-NEXT: movl $buf, %eax
; CHECK-NEXT: movl $32, %ecx
; CHECK-NEXT: tilestored %tmm0, (%rax,%rcx)
; CHECK-NEXT: addq $2936, %rsp # imm = 0xB78
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %4 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
  %5 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
  %6 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
  %7 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
  %8 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
  %9 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
  %10 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
  %11 = icmp eq i32 %0, 0
  br i1 %11, label %16, label %12

12: ; preds = %3
  %13 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %1, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
  %14 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
  %15 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
  br label %20

16: ; preds = %3
  %17 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %1, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) #3
  %18 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) #3
  %19 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) #3
  br label %20

20: ; preds = %16, %12
  %21 = phi <256 x i32> [ %17, %16 ], [ %13, %12 ]
  %22 = phi <256 x i32> [ %18, %16 ], [ %14, %12 ]
  %23 = phi <256 x i32> [ %19, %16 ], [ %15, %12 ]
  %24 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %1, i16 %2, i16 %1, <256 x i32> %23, <256 x i32> %21, <256 x i32> %22) #3
  %25 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %1, i16 %2, i16 %2, <256 x i32> %6, <256 x i32> %24, <256 x i32> %5) #3
  %26 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %1, i16 %2, i16 %2, <256 x i32> %8, <256 x i32> %25, <256 x i32> %7) #3
  %27 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %2, i16 %2, i16 %2, <256 x i32> %10, <256 x i32> %26, <256 x i32> %9) #3
  tail call void @llvm.x86.tilestored64.internal(i16 %2, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, <256 x i32> %27) #3
  ret void
}

declare <256 x i32> @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #3
declare <256 x i32> @llvm.x86.tdpbssd.internal(i16, i16, i16, <256 x i32>, <256 x i32>, <256 x i32>) #3
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, <256 x i32>) #3

attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="8192" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind }
@ -0,0 +1,143 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -lower-amx-type %s -S | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%struct.__tile_str = type { i16, i16, <256 x i32> }

@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16

define dso_local void @test_load(i8* %in, i8* %out) local_unnamed_addr #2 {
; CHECK-LABEL: @test_load(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[IN:%.*]] to <256 x i32>*
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[OUT:%.*]] to <256 x i32>*
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i32>* [[TMP1]] to <128 x i32>*
; CHECK-NEXT: [[TMP4:%.*]] = load <128 x i32>, <128 x i32>* [[TMP3]], align 64
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr <128 x i32>, <128 x i32>* [[TMP3]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = load <128 x i32>, <128 x i32>* [[TMP5]], align 64
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <256 x i32>* [[TMP2]] to <128 x i32>*
; CHECK-NEXT: store <128 x i32> [[TMP4]], <128 x i32>* [[TMP7]], align 64
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <128 x i32>, <128 x i32>* [[TMP7]], i32 1
; CHECK-NEXT: store <128 x i32> [[TMP6]], <128 x i32>* [[TMP8]], align 64
; CHECK-NEXT: ret void
;
  %1 = bitcast i8* %in to <256 x i32>*
  %2 = bitcast i8* %out to <256 x i32>*
  %3 = load <256 x i32>, <256 x i32>* %1, align 64, !tbaa !8
  store <256 x i32> %3, <256 x i32>* %2, align 64, !tbaa !8
  ret void
}

define dso_local void @__tile_loadd(%struct.__tile_str* nocapture %0, i8* %1, i64 %2) local_unnamed_addr #0 {
; CHECK-LABEL: @__tile_loadd(
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], %struct.__tile_str* [[TMP0:%.*]], i64 0, i32 0
; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP4]], align 64, [[TBAA2:!tbaa !.*]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP0]], i64 0, i32 1
; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[TMP6]], align 2, [[TBAA7:!tbaa !.*]]
; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP2:%.*]], 32
; CHECK-NEXT: [[TMP9:%.*]] = ashr exact i64 [[TMP8]], 32
; CHECK-NEXT: [[TMP10:%.*]] = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP1:%.*]], i64 [[TMP9]]) [[ATTR3:#.*]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP0]], i64 0, i32 2
; CHECK-NEXT: [[TMP12:%.*]] = bitcast <256 x i32>* [[TMP11]] to i8*
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP12]], i64 64, <256 x i32> [[TMP10]])
; CHECK-NEXT: ret void
;
  %4 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %0, i64 0, i32 0
  %5 = load i16, i16* %4, align 64, !tbaa !2
  %6 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %0, i64 0, i32 1
  %7 = load i16, i16* %6, align 2, !tbaa !7
  %8 = shl i64 %2, 32
  %9 = ashr exact i64 %8, 32
  %10 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %5, i16 %7, i8* %1, i64 %9) #3
  %11 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %0, i64 0, i32 2
  store <256 x i32> %10, <256 x i32>* %11, align 64, !tbaa !8
  ret void
}

define dso_local void @__tile_dpbsud(%struct.__tile_str* nocapture %0, %struct.__tile_str* nocapture readonly byval(%struct.__tile_str) align 64 %1, %struct.__tile_str* nocapture readonly byval(%struct.__tile_str) align 64 %2) local_unnamed_addr #0 {
; CHECK-LABEL: @__tile_dpbsud(
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], %struct.__tile_str* [[TMP1:%.*]], i64 0, i32 0
; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP4]], align 64, [[TBAA2]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP2:%.*]], i64 0, i32 1
; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[TMP6]], align 2, [[TBAA7]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP1]], i64 0, i32 1
; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP8]], align 2, [[TBAA7]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP0:%.*]], i64 0, i32 2
; CHECK-NEXT: [[TMP11:%.*]] = bitcast <256 x i32>* [[TMP10]] to i8*
; CHECK-NEXT: [[TMP12:%.*]] = call <256 x i32> @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP11]], i64 64)
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP1]], i64 0, i32 2
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <256 x i32>* [[TMP13]] to i8*
; CHECK-NEXT: [[TMP15:%.*]] = call <256 x i32> @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[TMP9]], i8* [[TMP14]], i64 64)
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP2]], i64 0, i32 2
; CHECK-NEXT: [[TMP17:%.*]] = bitcast <256 x i32>* [[TMP16]] to i8*
; CHECK-NEXT: [[TMP18:%.*]] = call <256 x i32> @llvm.x86.tileloadd64.internal(i16 [[TMP9]], i16 [[TMP7]], i8* [[TMP17]], i64 64)
; CHECK-NEXT: [[TMP19:%.*]] = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 [[TMP5]], i16 [[TMP7]], i16 [[TMP9]], <256 x i32> [[TMP12]], <256 x i32> [[TMP15]], <256 x i32> [[TMP18]]) [[ATTR3]]
; CHECK-NEXT: [[TMP20:%.*]] = bitcast <256 x i32>* [[TMP10]] to i8*
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP20]], i64 64, <256 x i32> [[TMP19]])
; CHECK-NEXT: ret void
;
  %4 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %1, i64 0, i32 0
  %5 = load i16, i16* %4, align 64, !tbaa !2
  %6 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 1
  %7 = load i16, i16* %6, align 2, !tbaa !7
  %8 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %1, i64 0, i32 1
  %9 = load i16, i16* %8, align 2, !tbaa !7
  %10 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %0, i64 0, i32 2
  %11 = load <256 x i32>, <256 x i32>* %10, align 64, !tbaa !8
  %12 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %1, i64 0, i32 2
  %13 = load <256 x i32>, <256 x i32>* %12, align 64, !tbaa !8
  %14 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 2
  %15 = load <256 x i32>, <256 x i32>* %14, align 64, !tbaa !8
  %16 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %5, i16 %7, i16 %9, <256 x i32> %11, <256 x i32> %13, <256 x i32> %15) #3
  store <256 x i32> %16, <256 x i32>* %10, align 64, !tbaa !8
  ret void
}

define dso_local void @__tile_stored(i8* %0, i64 %1, %struct.__tile_str* nocapture readonly byval(%struct.__tile_str) align 64 %2) local_unnamed_addr #1 {
; CHECK-LABEL: @__tile_stored(
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], %struct.__tile_str* [[TMP2:%.*]], i64 0, i32 0
; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP4]], align 64, [[TBAA2]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP2]], i64 0, i32 1
; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[TMP6]], align 2, [[TBAA7]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP2]], i64 0, i32 2
; CHECK-NEXT: [[TMP9:%.*]] = bitcast <256 x i32>* [[TMP8]] to i8*
; CHECK-NEXT: [[TMP10:%.*]] = call <256 x i32> @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP9]], i64 64)
; CHECK-NEXT: [[TMP11:%.*]] = shl i64 [[TMP1:%.*]], 32
; CHECK-NEXT: [[TMP12:%.*]] = ashr exact i64 [[TMP11]], 32
; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP0:%.*]], i64 [[TMP12]], <256 x i32> [[TMP10]]) [[ATTR3]]
; CHECK-NEXT: ret void
;
  %4 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 0
  %5 = load i16, i16* %4, align 64, !tbaa !2
  %6 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 1
  %7 = load i16, i16* %6, align 2, !tbaa !7
  %8 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 2
  %9 = load <256 x i32>, <256 x i32>* %8, align 64, !tbaa !8
  %10 = shl i64 %1, 32
  %11 = ashr exact i64 %10, 32
  tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %7, i8* %0, i64 %11, <256 x i32> %9) #3
  ret void
}

declare <256 x i32> @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #3
declare <256 x i32> @llvm.x86.tdpbssd.internal(i16, i16, i16, <256 x i32>, <256 x i32>, <256 x i32>) #3
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, <256 x i32>) #3

attributes #0 = { alwaysinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="8192" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { alwaysinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+avx,+avx2,+avx512f,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind }

!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 12.0.0 (ssh://git-amr-1.devtools.intel.com:29418/dpd_icl-llvm_project_worldread f3c78a3f053379a2511e00e9ce2c13383ea3f835)"}
!2 = !{!3, !4, i64 0}
!3 = !{!"__tile_str", !4, i64 0, !4, i64 2, !5, i64 1024}
!4 = !{!"short", !5, i64 0}
!5 = !{!"omnipotent char", !6, i64 0}
!6 = !{!"Simple C/C++ TBAA"}
!7 = !{!3, !4, i64 2}
!8 = !{!5, !5, i64 0}
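The @test_load case above shows the fallback half of the lower-amx-type pass: a <256 x i32> load/store that never reaches an AMX intrinsic is simply split into two <128 x i32> halves. A rough C++ analogue of the rewritten memory traffic (a sketch only, assuming 4-byte i32 so each half is 512 bytes):

  #include <cstring>

  // Move one 1024-byte tile value as two 512-byte halves, mirroring the
  // two <128 x i32> load/store pairs the pass emits.
  void copyTileValue(void *Out, const void *In) {
    std::memcpy(Out, In, 512);
    std::memcpy(static_cast<char *>(Out) + 512,
                static_cast<const char *>(In) + 512, 512);
  }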
@ -18,6 +18,7 @@
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Lower AMX type for load/store
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Lower Garbage Collection Instructions
; CHECK-NEXT: Shadow Stack GC Lowering
@ -3,7 +3,7 @@
target triple = "x86_64-unknown-unknown"
declare void @bar1()
define preserve_allcc void @foo()#0 {
; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $fpcw $fpsw $fs $gs $hip $ip $mxcsr $rip $riz $ss $ssp $bnd0 $bnd1 $bnd2 $bnd3 $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $r11b $r11bh $r11d $r11w $r11wh $k0_k1 $k2_k3 $k4_k5 $k6_k7
; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $fpcw $fpsw $fs $gs $hip $ip $mxcsr $rip $riz $ss $ssp $tmmcfg $bnd0 $bnd1 $bnd2 $bnd3 $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $r11b $r11bh $r11d $r11w $r11wh $k0_k1 $k2_k3 $k4_k5 $k6_k7
call void @bar1()
call void @bar2()
ret void
@ -24,6 +24,7 @@
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Lower AMX type for load/store
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)

@ -118,11 +119,12 @@
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: X86 EFLAGS copy lowering
; CHECK-NEXT: X86 WinAlloca Expander
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: Tile Register Pre-configure
; CHECK-NEXT: Detect Dead Lanes
; CHECK-NEXT: Process Implicit Definitions
; CHECK-NEXT: Remove unreachable machine basic blocks
; CHECK-NEXT: Live Variable Analysis
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: Machine Natural Loop Construction
; CHECK-NEXT: Eliminate PHI nodes for register allocation
; CHECK-NEXT: Two-Address instruction pass

@ -141,6 +143,7 @@
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Machine Optimization Remark Emitter
; CHECK-NEXT: Greedy Register Allocator
; CHECK-NEXT: Tile Register Configure
; CHECK-NEXT: Virtual Register Rewriter
; CHECK-NEXT: Stack Slot Coloring
; CHECK-NEXT: Machine Copy Propagation Pass
@ -91,7 +91,7 @@ body: |
; CHECK-DAG: MOV64mr %stack.1, 1, $noreg, 0, $noreg, $rdi :: (store 8 into %stack.1)
; CHECK: EH_LABEL <mcsymbol .Ltmp0>
; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK: STATEPOINT 0, 0, 1, @some_call, $rdi, 2, 0, 2, 0, 2, 5, 2, 0, 2, -1, 2, 0, 2, 0, 2, 0, 2, 2, 1, 8, %stack.0, 0, 1, 8, %stack.1, 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store 8 on %stack.1), (load store 8 on %stack.0)
; CHECK: STATEPOINT 0, 0, 1, @some_call, $rdi, 2, 0, 2, 0, 2, 5, 2, 0, 2, -1, 2, 0, 2, 0, 2, 0, 2, 2, 1, 8, %stack.0, 0, 1, 8, %stack.1, 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store 8 on %stack.0), (load store 8 on %stack.1)
; CHECK-DAG: $r14 = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load 8 from %stack.0)
; CHECK-DAG: $rbx = MOV64rm %stack.1, 1, $noreg, 0, $noreg :: (load 8 from %stack.1)
; CHECK: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp

@ -108,7 +108,7 @@ body: |
; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK: MOV64mr [[STACK0:%stack.[0-9]+]], 1, $noreg, 0, $noreg, killed $rbx :: (store 8 into [[STACK0]])
; CHECK: MOV64mr [[STACK1:%stack.[0-9]+]], 1, $noreg, 0, $noreg, killed $r14 :: (store 8 into [[STACK1]])
; CHECK: STATEPOINT 0, 0, 0, @foo, 2, 0, 2, 0, 2, 0, 2, 2, 1, 8, [[STACK0]], 0, 1, 8, [[STACK1]], 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store 8 on [[STACK0]]), (load store 8 on [[STACK1]])
; CHECK: STATEPOINT 0, 0, 0, @foo, 2, 0, 2, 0, 2, 0, 2, 2, 1, 8, [[STACK0]], 0, 1, 8, [[STACK1]], 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store 8 on [[STACK1]]), (load store 8 on [[STACK0]])
; CHECK-DAG: $rbx = MOV64rm [[STACK0]], 1, $noreg, 0, $noreg :: (load 8 from [[STACK0]])
; CHECK-DAG: $r14 = MOV64rm [[STACK1]], 1, $noreg, 0, $noreg :: (load 8 from [[STACK1]])
; CHECK: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp

@ -121,7 +121,7 @@ body: |
; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-DAG: MOV64mr [[STACK0]], 1, $noreg, 0, $noreg, killed $rbx :: (store 8 into [[STACK0]])
; CHECK-DAG: MOV64mr [[STACK1]], 1, $noreg, 0, $noreg, killed $r14 :: (store 8 into [[STACK1]])
; CHECK: STATEPOINT 0, 0, 0, @bar, 2, 0, 2, 0, 2, 0, 2, 2, 1, 8, %stack.0, 0, 1, 8, [[STACK1]], 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store 8 on [[STACK0]]), (load store 8 on [[STACK1]])
; CHECK: STATEPOINT 0, 0, 0, @bar, 2, 0, 2, 0, 2, 0, 2, 2, 1, 8, %stack.0, 0, 1, 8, [[STACK1]], 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store 8 on [[STACK1]]), (load store 8 on [[STACK0]])
; CHECK-DAG: $rbx = MOV64rm [[STACK0]], 1, $noreg, 0, $noreg :: (load 8 from [[STACK0]])
; CHECK-DAG: $r14 = MOV64rm [[STACK1]], 1, $noreg, 0, $noreg :: (load 8 from [[STACK1]])
; CHECK: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp