[AMDGPU] new gfx940 fp atomics
Differential Revision: https://reviews.llvm.org/D121028
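The patch adds the gfx940 floating-point atomic-add instructions (flat f32 add and packed f16/bf16 adds for flat, global, and LDS memory) to Clang builtins, LLVM intrinsics, instruction selection, and the MC layer. As a rough source-level sketch of how the new builtins are meant to be called (the function and variable names here are illustrative only; the authoritative prototypes are the TARGET_BUILTIN entries below, and the CodeGenOpenCL tests in this patch check the real lowering), an OpenCL caller built for a gfx940 target might look like:

typedef half __attribute__((ext_vector_type(2))) half2;
typedef short __attribute__((ext_vector_type(2))) short2; // carries packed bf16 bit patterns

void fadd_sketch(__generic float *pf, float xf,
                 __generic half2 *ph, half2 xh,
                 __generic short2 *ps, __global short2 *pg,
                 __local short2 *pl, short2 xs) {
  float  old_f  = __builtin_amdgcn_flat_atomic_fadd_f32(pf, xf);      // flat_atomic_add_f32
  half2  old_h2 = __builtin_amdgcn_flat_atomic_fadd_v2f16(ph, xh);    // flat_atomic_pk_add_f16
  short2 old_b2 = __builtin_amdgcn_flat_atomic_fadd_v2bf16(ps, xs);   // flat_atomic_pk_add_bf16
  short2 old_g2 = __builtin_amdgcn_global_atomic_fadd_v2bf16(pg, xs); // global_atomic_pk_add_bf16
  short2 old_l2 = __builtin_amdgcn_ds_atomic_fadd_v2bf16(pl, xs);     // ds_pk_add_rtn_bf16
  (void)old_f; (void)old_h2; (void)old_b2; (void)old_g2; (void)old_l2;
}

Each builtin atomically adds the value and returns the previous contents of the memory location.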
@@ -209,6 +209,12 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fmax_f64, "dd*0d", "t", "gfx90a-inst
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f64, "dd*3d", "t", "gfx90a-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f32, "ff*3f", "t", "gfx8-insts")
TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_f32, "ff*0f", "t", "gfx940-insts")
TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2f16, "V2hV2h*0V2h", "t", "gfx940-insts")
TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "gfx940-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "gfx940-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "gfx940-insts")

//===----------------------------------------------------------------------===//
// Deep learning builtins.
//===----------------------------------------------------------------------===//
@@ -16513,7 +16513,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
    Intrinsic::ID IID;
    llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
    switch (BuiltinID) {
@@ -16544,6 +16546,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
      IID = Intrinsic::amdgcn_flat_atomic_fmax;
      break;
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
      IID = Intrinsic::amdgcn_flat_atomic_fadd;
      break;
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
      ArgTy = llvm::FixedVectorType::get(
          llvm::Type::getHalfTy(getLLVMContext()), 2);
      IID = Intrinsic::amdgcn_flat_atomic_fadd;
      break;
    }
    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
    llvm::Value *Val = EmitScalarExpr(E->getArg(1));
@@ -16551,6 +16562,22 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
        CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
    return Builder.CreateCall(F, {Addr, Val});
  }
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
      IID = Intrinsic::amdgcn_global_atomic_fadd_v2bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
      IID = Intrinsic::amdgcn_flat_atomic_fadd_v2bf16;
      break;
    }
    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
    llvm::Value *Val = EmitScalarExpr(E->getArg(1));
    llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
    return Builder.CreateCall(F, {Addr, Val});
  }
  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32: {
    Intrinsic::ID IID;
@@ -0,0 +1,17 @@
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
// RUN:   -verify -S -o - %s

// REQUIRES: amdgpu-registered-target

typedef half __attribute__((ext_vector_type(2))) half2;
typedef short __attribute__((ext_vector_type(2))) short2;

void test_atomic_fadd(__global half2 *addrh2, half2 xh2,
                      __global short2 *addrs2, __local short2 *addrs2l, short2 xs2,
                      __global float *addrf, float xf) {
  __builtin_amdgcn_flat_atomic_fadd_f32(addrf, xf); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_f32' needs target feature gfx940-insts}}
  __builtin_amdgcn_flat_atomic_fadd_v2f16(addrh2, xh2); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_v2f16' needs target feature gfx940-insts}}
  __builtin_amdgcn_flat_atomic_fadd_v2bf16(addrs2, xs2); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_v2bf16' needs target feature gfx940-insts}}
  __builtin_amdgcn_global_atomic_fadd_v2bf16(addrs2, xs2); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_v2bf16' needs target feature gfx940-insts}}
  __builtin_amdgcn_ds_atomic_fadd_v2bf16(addrs2l, xs2); // expected-error{{'__builtin_amdgcn_ds_atomic_fadd_v2bf16' needs target feature gfx940-insts}}
}
@@ -0,0 +1,50 @@
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 \
// RUN:   %s -S -emit-llvm -o - | FileCheck %s

// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 \
// RUN:   -S -o - %s | FileCheck -check-prefix=GFX940 %s

// REQUIRES: amdgpu-registered-target

typedef half __attribute__((ext_vector_type(2))) half2;
typedef short __attribute__((ext_vector_type(2))) short2;

// CHECK-LABEL: test_flat_add_f32
// CHECK: call float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %{{.*}}, float %{{.*}})
// GFX940-LABEL: test_flat_add_f32
// GFX940: flat_atomic_add_f32
half2 test_flat_add_f32(__generic float *addr, float x) {
  return __builtin_amdgcn_flat_atomic_fadd_f32(addr, x);
}

// CHECK-LABEL: test_flat_add_2f16
// CHECK: call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %{{.*}}, <2 x half> %{{.*}})
// GFX940-LABEL: test_flat_add_2f16
// GFX940: flat_atomic_pk_add_f16
half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
  return __builtin_amdgcn_flat_atomic_fadd_v2f16(addr, x);
}

// CHECK-LABEL: test_flat_add_2bf16
// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %{{.*}}, <2 x i16> %{{.*}})
// GFX940-LABEL: test_flat_add_2bf16
// GFX940: flat_atomic_pk_add_bf16
short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
  return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x);
}

// CHECK-LABEL: test_global_add_2bf16
// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %{{.*}}, <2 x i16> %{{.*}})
// GFX940-LABEL: test_global_add_2bf16
// GFX940: global_atomic_pk_add_bf16
short2 test_global_add_2bf16(__global short2 *addr, short2 x) {
  return __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x);
}

// CHECK-LABEL: test_local_add_2bf16
// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %{{.*}}, <2 x i16> %
// GFX940-LABEL: test_local_add_2bf16
// GFX940: ds_pk_add_rtn_bf16
short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
  return __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x);
}
@@ -1978,6 +1978,19 @@ def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, ll
def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty, llvm_double_ty>;
def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>;

//===----------------------------------------------------------------------===//
// gfx940 intrinsics
//===----------------------------------------------------------------------===//

// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_ds_fadd_v2bf16 : Intrinsic<
  [llvm_v2i16_ty],
  [LLVMQualPointerType<llvm_v2i16_ty, 3>, llvm_v2i16_ty],
  [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>]>,
  GCCBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;

//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
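At the IR level, the new v2bf16 intrinsics defined above take the pointer and the packed value and return the previous memory contents; since there is no bf16 IR type, the payload is modeled as <2 x i16>. A minimal LLVM IR sketch using the same intrinsic signatures as the codegen tests in this patch (function and value names are placeholders):

declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>*, <2 x i16>)
declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)*, <2 x i16>)
declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)*, <2 x i16>)

define <2 x i16> @pk_add_bf16_sketch(<2 x i16>* %pf, <2 x i16> addrspace(1)* %pg,
                                     <2 x i16> addrspace(3)* %pl, <2 x i16> %v) {
  ; each call atomically adds %v and yields the old packed bf16 value
  %old.flat   = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %pf, <2 x i16> %v)
  %old.global = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %pg, <2 x i16> %v)
  %old.local  = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %pl, <2 x i16> %v)
  ret <2 x i16> %old.local
}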
@@ -1558,6 +1558,13 @@ def HasFmaLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts()">,
def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">,
  AssemblerPredicate<(all_of FeatureAtomicFaddInsts)>;

// Differentiate between two functionally equivalent, but incompatible
// encoding-wise FP atomics between gfx90* and gfx940
def HasAtomicFaddInstsGFX90X : Predicate<"Subtarget->hasAtomicFaddInsts()">,
  AssemblerPredicate<(all_of FeatureAtomicFaddInsts, (not FeatureGFX940Insts))>;
def HasAtomicFaddInstsGFX940 : Predicate<"Subtarget->hasAtomicFaddInsts()">,
  AssemblerPredicate<(all_of FeatureAtomicFaddInsts, FeatureGFX940Insts)>;

def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">,
  AssemblerPredicate<(all_of FeatureDsSrc2Insts)>;
@@ -1297,6 +1297,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasGFX940Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
@@ -4339,6 +4339,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
    case Intrinsic::amdgcn_flat_atomic_fadd:
    case Intrinsic::amdgcn_flat_atomic_fmin:
    case Intrinsic::amdgcn_flat_atomic_fmax:
    case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
    case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
@@ -205,9 +205,12 @@ def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_ds_fadd>;
def : SourceOfDivergence<int_amdgcn_ds_fmin>;
def : SourceOfDivergence<int_amdgcn_ds_fmax>;
def : SourceOfDivergence<int_amdgcn_ds_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_buffer_atomic_sub>;
@@ -462,6 +462,13 @@ let SubtargetPredicate = isGFX90APlus in {
  defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64", VReg_64, "ds_add_f64">;
} // End SubtargetPredicate = isGFX90APlus

let SubtargetPredicate = isGFX940Plus in {
  defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_f16">;
  defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_f16", VGPR_32, "ds_pk_add_f16">;
  defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_bf16">;
  defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_bf16", VGPR_32, "ds_pk_add_bf16">;
} // End SubtargetPredicate = isGFX940Plus

defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">;
defm DS_CMPST_B32 : DS_1A2D_NORET_mc<"ds_cmpst_b32">;
defm DS_CMPST_F32 : DS_1A2D_NORET_mc<"ds_cmpst_f32">;

@@ -998,6 +1005,14 @@ def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_ret_64>;
def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>;
}

let SubtargetPredicate = isGFX940Plus in {
def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_32>;
def : GCNPat <
  (v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)),
  (DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
>;
}

def : Pat <
  (SIds_ordered_count i32:$value, i16:$offset),
  (DS_ORDERED_COUNT $value, (as_i16imm $offset))

@@ -1410,3 +1425,10 @@ let SubtargetPredicate = isGFX90APlus in {
def DS_ADD_F64_vi : DS_Real_vi<0x5c, DS_ADD_F64>;
def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>;
} // End SubtargetPredicate = isGFX90APlus

let SubtargetPredicate = isGFX940Plus in {
  def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>;
  def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>;
  def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>;
  def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>;
} // End SubtargetPredicate = isGFX940Plus
@@ -630,6 +630,13 @@ let SubtargetPredicate = isGFX90APlus in {
  defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
} // End SubtargetPredicate = isGFX90APlus

let SubtargetPredicate = isGFX940Plus in {
  defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32, int_amdgcn_flat_atomic_fadd>;
  defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_flat_atomic_fadd>;
  defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2f16>;
  defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2f16>;
} // End SubtargetPredicate = isGFX940Plus

defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>;
defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>;

@@ -1280,6 +1287,13 @@ def : FlatSignedAtomicPat <FLAT_ATOMIC_MAX_F64_RTN, atomic_load_fmax_flat_ret_64
def : FlatSignedAtomicPat <FLAT_ATOMIC_MAX_F64, atomic_load_fmax_flat_noret_64, f64>;
}

let OtherPredicates = [isGFX940Plus] in {
  def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F32_RTN, atomic_load_fadd_flat_32, f32>;
  def : FlatSignedAtomicPat <FLAT_ATOMIC_PK_ADD_F16_RTN, atomic_load_fadd_v2f16_flat_32, v2f16>;
  def : FlatSignedAtomicPat <FLAT_ATOMIC_PK_ADD_BF16_RTN, int_amdgcn_flat_atomic_fadd_v2bf16, v2i16>;
  defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_PK_ADD_BF16", int_amdgcn_global_atomic_fadd_v2bf16, v2i16>;
}

} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10

let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
@@ -1432,6 +1446,14 @@ multiclass FLAT_Real_AllAddr_vi<bits<7> op,
  def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
}

class FLAT_Real_gfx940 <bits<7> op, FLAT_Pseudo ps> :
  FLAT_Real <op, ps>,
  SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX940> {
  let AssemblerPredicate = isGFX940Plus;
  let DecoderNamespace = "GFX9";
  let Inst{25} = !if(ps.has_sccb, cpol{CPolBit.SCC}, ps.sccbValue);
}

def FLAT_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>;
def FLAT_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>;
def FLAT_LOAD_USHORT_vi : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>;

@@ -1574,7 +1596,7 @@ defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>;
defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>;
defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>;

let SubtargetPredicate = HasAtomicFaddInsts in {
let SubtargetPredicate = HasAtomicFaddInstsGFX90X in {
  defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_vi <0x04d, 0>;
  defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e, 0>;
}
@@ -1588,6 +1610,40 @@ let SubtargetPredicate = isGFX90AOnly in {
  defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_vi<0x51, 0>;
} // End SubtargetPredicate = isGFX90AOnly

multiclass FLAT_Real_AllAddr_gfx940<bits<7> op> {
  def _gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME)>;
  def _SADDR_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
}

multiclass FLAT_Real_Atomics_gfx940 <bits<7> op, FLAT_Pseudo ps> {
  def _gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>;
  def _RTN_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>;
}

multiclass FLAT_Global_Real_Atomics_gfx940<bits<7> op> :
  FLAT_Real_AllAddr_gfx940<op> {
  def _RTN_gfx940 : FLAT_Real_gfx940 <op, !cast<FLAT_Pseudo>(NAME#"_RTN")>;
  def _SADDR_RTN_gfx940 : FLAT_Real_gfx940 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN")>;
}

let SubtargetPredicate = HasAtomicFaddInstsGFX940 in {
  defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_gfx940 <0x04d>;
  defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_gfx940 <0x04e>;
}

let SubtargetPredicate = isGFX940Plus in {
  defm FLAT_ATOMIC_ADD_F64 : FLAT_Real_Atomics_gfx940<0x4f, FLAT_ATOMIC_ADD_F64>;
  defm FLAT_ATOMIC_MIN_F64 : FLAT_Real_Atomics_gfx940<0x50, FLAT_ATOMIC_MIN_F64>;
  defm FLAT_ATOMIC_MAX_F64 : FLAT_Real_Atomics_gfx940<0x51, FLAT_ATOMIC_MAX_F64>;
  defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Real_Atomics_gfx940<0x4f>;
  defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Real_Atomics_gfx940<0x50>;
  defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_gfx940<0x51>;
  defm FLAT_ATOMIC_ADD_F32 : FLAT_Real_Atomics_vi<0x4d, FLAT_ATOMIC_ADD_F32>;
  defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Real_Atomics_vi<0x4e, FLAT_ATOMIC_PK_ADD_F16>;
  defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Real_Atomics_vi<0x52, FLAT_ATOMIC_PK_ADD_BF16>;
  defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi<0x52>;
} // End SubtargetPredicate = isGFX940Plus

//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//
@@ -1270,7 +1270,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax: {
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);

@@ -1326,6 +1328,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_global_atomic_csub: {
    Value *Ptr = II->getArgOperand(0);
    AccessTy = II->getType();

@@ -12448,6 +12452,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {

  if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
      Subtarget->hasAtomicFaddInsts()) {
    if (Subtarget->hasGFX940Insts())
      return AtomicExpansionKind::None;

    // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
    // floating point atomic instructions. May generate more efficient code,
    // but may not respect rounding and denormal modes, and may give incorrect
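With the gfx940 early-out above, shouldExpandAtomicRMWInIR no longer turns a flat or global fadd atomicrmw into a compare-and-swap loop on gfx940, so a plain IR atomic can be selected directly to the native instruction; the llc tests below exercise exactly this pattern. A minimal IR sketch (the function name is illustrative):

define void @fadd_flat_native(float* %p) {
  ; on gfx940 this is selected to flat_atomic_add_f32 instead of being expanded to a cmpxchg loop
  %old = atomicrmw fadd float* %p, float 4.0 seq_cst
  ret void
}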
@ -0,0 +1,86 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc < %s -march=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940
|
||||
|
||||
declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data)
|
||||
declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data)
|
||||
|
||||
; bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
|
||||
declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data)
|
||||
declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data)
|
||||
declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3) * %ptr, <2 x half> %data, i32, i32, i1)
|
||||
declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3) * %ptr, <2 x i16> %data)
|
||||
|
||||
define amdgpu_kernel void @flat_atomic_fadd_f32_noret(float* %ptr, float %data) {
|
||||
; GFX940-LABEL: flat_atomic_fadd_f32_noret:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
|
||||
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
|
||||
; GFX940-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
|
||||
; GFX940-NEXT: s_endpgm
|
||||
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data)
|
||||
ret void
|
||||
}
|
||||
|
||||
define float @flat_atomic_fadd_f32_rtn(float* %ptr, float %data) {
|
||||
; GFX940-LABEL: flat_atomic_fadd_f32_rtn:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: s_setpc_b64 s[30:31]
|
||||
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(<2 x half>* %ptr, <2 x half> %data) {
|
||||
; GFX940-LABEL: flat_atomic_fadd_v2f16_noret:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
|
||||
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
|
||||
; GFX940-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
|
||||
; GFX940-NEXT: s_endpgm
|
||||
%ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data)
|
||||
ret void
|
||||
}
|
||||
|
||||
define <2 x half> @flat_atomic_fadd_v2f16_rtn(<2 x half>* %ptr, <2 x half> %data) {
|
||||
; GFX940-LABEL: flat_atomic_fadd_v2f16_rtn:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: s_setpc_b64 s[30:31]
|
||||
%ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data)
|
||||
ret <2 x half> %ret
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(<2 x half> addrspace(3)* %ptr, <2 x half> %data) {
|
||||
; GFX940-LABEL: local_atomic_fadd_v2f16_noret:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_load_dword s2, s[0:1], 0x24
|
||||
; GFX940-NEXT: s_load_dword s3, s[0:1], 0x28
|
||||
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX940-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX940-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX940-NEXT: ds_pk_add_f16 v0, v1
|
||||
; GFX940-NEXT: s_endpgm
|
||||
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3)* %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
define <2 x half> @local_atomic_fadd_v2f16_rtn(<2 x half> addrspace(3)* %ptr, <2 x half> %data) {
|
||||
; GFX940-LABEL: local_atomic_fadd_v2f16_rtn:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
|
||||
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX940-NEXT: s_setpc_b64 s[30:31]
|
||||
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3)* %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
|
||||
ret <2 x half> %ret
|
||||
}
|
|
@ -0,0 +1,224 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc < %s -march=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940
|
||||
|
||||
declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data)
|
||||
declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data)
|
||||
|
||||
; bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
|
||||
declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data)
|
||||
declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data)
|
||||
declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3) * %ptr, <2 x half> %data, i32, i32, i1)
|
||||
declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3) * %ptr, <2 x i16> %data)
|
||||
|
||||
define amdgpu_kernel void @flat_atomic_fadd_f32_noret(float* %ptr, float %data) {
|
||||
; GFX940-LABEL: flat_atomic_fadd_f32_noret:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
|
||||
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
|
||||
; GFX940-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
|
||||
; GFX940-NEXT: s_endpgm
|
||||
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(float* %ptr) {
|
||||
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
|
||||
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
||||
; GFX940-NEXT: buffer_wbl2
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: buffer_invl2
|
||||
; GFX940-NEXT: buffer_wbinvl1_vol
|
||||
; GFX940-NEXT: s_endpgm
|
||||
%ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(float* %ptr) #0 {
|
||||
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
|
||||
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
||||
; GFX940-NEXT: buffer_wbl2
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: buffer_invl2
|
||||
; GFX940-NEXT: buffer_wbinvl1_vol
|
||||
; GFX940-NEXT: s_endpgm
|
||||
%ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define float @flat_atomic_fadd_f32_rtn(float* %ptr, float %data) {
|
||||
; GFX940-LABEL: flat_atomic_fadd_f32_rtn:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: s_setpc_b64 s[30:31]
|
||||
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define float @flat_atomic_fadd_f32_rtn_pat(float* %ptr, float %data) {
|
||||
; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
|
||||
; GFX940-NEXT: buffer_wbl2
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: buffer_invl2
|
||||
; GFX940-NEXT: buffer_wbinvl1_vol
|
||||
; GFX940-NEXT: s_setpc_b64 s[30:31]
|
||||
%ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(<2 x half>* %ptr, <2 x half> %data) {
|
||||
; GFX940-LABEL: flat_atomic_fadd_v2f16_noret:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
|
||||
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
|
||||
; GFX940-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
|
||||
; GFX940-NEXT: s_endpgm
|
||||
%ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data)
|
||||
ret void
|
||||
}
|
||||
|
||||
define <2 x half> @flat_atomic_fadd_v2f16_rtn(<2 x half>* %ptr, <2 x half> %data) {
|
||||
; GFX940-LABEL: flat_atomic_fadd_v2f16_rtn:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: s_setpc_b64 s[30:31]
|
||||
%ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data)
|
||||
ret <2 x half> %ret
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(<2 x i16>* %ptr, <2 x i16> %data) {
|
||||
; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
|
||||
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
|
||||
; GFX940-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
|
||||
; GFX940-NEXT: s_endpgm
|
||||
%ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data)
|
||||
ret void
|
||||
}
|
||||
|
||||
define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(<2 x i16>* %ptr, <2 x i16> %data) {
|
||||
; GFX940-LABEL: flat_atomic_fadd_v2bf16_rtn:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: s_setpc_b64 s[30:31]
|
||||
%ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data)
|
||||
ret <2 x i16> %ret
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) {
|
||||
; GFX940-LABEL: global_atomic_fadd_v2bf16_noret:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
|
||||
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX940-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX940-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v1, s[2:3]
|
||||
; GFX940-NEXT: s_endpgm
|
||||
%ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data)
|
||||
ret void
|
||||
}
|
||||
|
||||
define <2 x i16> @global_atomic_fadd_v2bf16_rtn(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) {
|
||||
; GFX940-LABEL: global_atomic_fadd_v2bf16_rtn:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX940-NEXT: s_setpc_b64 s[30:31]
|
||||
%ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data)
|
||||
ret <2 x i16> %ret
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(<2 x half> addrspace(3)* %ptr, <2 x half> %data) {
|
||||
; GFX940-LABEL: local_atomic_fadd_v2f16_noret:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_load_dword s2, s[0:1], 0x24
|
||||
; GFX940-NEXT: s_load_dword s3, s[0:1], 0x28
|
||||
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX940-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX940-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX940-NEXT: ds_pk_add_f16 v0, v1
|
||||
; GFX940-NEXT: s_endpgm
|
||||
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3)* %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
define <2 x half> @local_atomic_fadd_v2f16_rtn(<2 x half> addrspace(3)* %ptr, <2 x half> %data) {
|
||||
; GFX940-LABEL: local_atomic_fadd_v2f16_rtn:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
|
||||
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX940-NEXT: s_setpc_b64 s[30:31]
|
||||
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3)* %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
|
||||
ret <2 x half> %ret
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) {
|
||||
; GFX940-LABEL: local_atomic_fadd_v2bf16_noret:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_load_dword s2, s[0:1], 0x24
|
||||
; GFX940-NEXT: s_load_dword s3, s[0:1], 0x28
|
||||
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX940-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX940-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX940-NEXT: buffer_wbl2
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: ds_pk_add_bf16 v0, v1
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: buffer_invl2
|
||||
; GFX940-NEXT: buffer_wbinvl1_vol
|
||||
; GFX940-NEXT: s_endpgm
|
||||
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data)
|
||||
ret void
|
||||
}
|
||||
|
||||
define <2 x i16> @local_atomic_fadd_v2bf16_rtn(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) {
|
||||
; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn:
|
||||
; GFX940: ; %bb.0:
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: buffer_wbl2
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
|
||||
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX940-NEXT: buffer_invl2
|
||||
; GFX940-NEXT: buffer_wbinvl1_vol
|
||||
; GFX940-NEXT: s_setpc_b64 s[30:31]
|
||||
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data)
|
||||
ret <2 x i16> %ret
|
||||
}
|
||||
|
||||
attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" }
|
|
@ -1,6 +1,6 @@
|
|||
// RUN: llvm-mc -arch=amdgcn -mcpu=gfx940 -show-encoding %s | FileCheck --check-prefix=GFX940 --strict-whitespace %s
|
||||
// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=NOT-GFX940,GFX90A --implicit-check-not=error: %s
|
||||
// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX940 --implicit-check-not=error: %s
|
||||
// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX940,GFX10 --implicit-check-not=error: %s
|
||||
|
||||
// NOT-GFX940: error: invalid operand for instruction
|
||||
// GFX940: global_load_dword v2, v[2:3], off sc0 ; encoding: [0x00,0x80,0x51,0xdc,0x02,0x00,0x7f,0x02]
|
||||
|
@ -33,6 +33,102 @@ s_load_dword s2, s[2:3], 0x0 glc
|
|||
// GFX940: buffer_load_dword v5, off, s[8:11], s3 sc0 nt sc1 ; encoding: [0x00,0xc0,0x52,0xe0,0x00,0x05,0x02,0x03]
|
||||
buffer_load_dword v5, off, s[8:11], s3 sc0 nt sc1
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: flat_atomic_add_f32 v[2:3], v1 ; encoding: [0x00,0x00,0x34,0xdd,0x02,0x01,0x00,0x00]
|
||||
flat_atomic_add_f32 v[2:3], v1
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: flat_atomic_add_f32 v[2:3], a1 ; encoding: [0x00,0x00,0x34,0xdd,0x02,0x01,0x80,0x00]
|
||||
flat_atomic_add_f32 v[2:3], a1
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: flat_atomic_add_f32 v4, v[2:3], v1 sc0 ; encoding: [0x00,0x00,0x35,0xdd,0x02,0x01,0x00,0x04]
|
||||
flat_atomic_add_f32 v4, v[2:3], v1 sc0
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: flat_atomic_add_f32 a4, v[2:3], a1 sc0 ; encoding: [0x00,0x00,0x35,0xdd,0x02,0x01,0x80,0x04]
|
||||
flat_atomic_add_f32 a4, v[2:3], a1 sc0
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: flat_atomic_pk_add_f16 v4, v[2:3], v1 sc0 ; encoding: [0x00,0x00,0x39,0xdd,0x02,0x01,0x00,0x04]
|
||||
flat_atomic_pk_add_f16 v4, v[2:3], v1 sc0
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: flat_atomic_pk_add_f16 a4, v[2:3], a1 sc0 ; encoding: [0x00,0x00,0x39,0xdd,0x02,0x01,0x80,0x04]
|
||||
flat_atomic_pk_add_f16 a4, v[2:3], a1 sc0
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: flat_atomic_pk_add_f16 v[2:3], v1 ; encoding: [0x00,0x00,0x38,0xdd,0x02,0x01,0x00,0x00]
|
||||
flat_atomic_pk_add_f16 v[2:3], v1
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: flat_atomic_pk_add_f16 v[2:3], a1 ; encoding: [0x00,0x00,0x38,0xdd,0x02,0x01,0x80,0x00]
|
||||
flat_atomic_pk_add_f16 v[2:3], a1
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: flat_atomic_pk_add_bf16 v4, v[2:3], v1 sc0 ; encoding: [0x00,0x00,0x49,0xdd,0x02,0x01,0x00,0x04]
|
||||
flat_atomic_pk_add_bf16 v4, v[2:3], v1 sc0
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: flat_atomic_pk_add_bf16 a4, v[2:3], a1 sc0 ; encoding: [0x00,0x00,0x49,0xdd,0x02,0x01,0x80,0x04]
|
||||
flat_atomic_pk_add_bf16 a4, v[2:3], a1 sc0
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: flat_atomic_pk_add_bf16 v[2:3], v1 ; encoding: [0x00,0x00,0x48,0xdd,0x02,0x01,0x00,0x00]
|
||||
flat_atomic_pk_add_bf16 v[2:3], v1
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: flat_atomic_pk_add_bf16 v[2:3], a1 ; encoding: [0x00,0x00,0x48,0xdd,0x02,0x01,0x80,0x00]
|
||||
flat_atomic_pk_add_bf16 v[2:3], a1
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: global_atomic_pk_add_bf16 v4, v[2:3], v1, off sc0 ; encoding: [0x00,0x80,0x49,0xdd,0x02,0x01,0x7f,0x04]
|
||||
global_atomic_pk_add_bf16 v4, v[2:3], v1, off sc0
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: global_atomic_pk_add_bf16 a4, v[2:3], a1, off sc0 ; encoding: [0x00,0x80,0x49,0xdd,0x02,0x01,0xff,0x04]
|
||||
global_atomic_pk_add_bf16 a4, v[2:3], a1, off sc0
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: global_atomic_pk_add_bf16 v[2:3], v1, off ; encoding: [0x00,0x80,0x48,0xdd,0x02,0x01,0x7f,0x00]
|
||||
global_atomic_pk_add_bf16 v[2:3], v1, off
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: global_atomic_pk_add_bf16 v[2:3], a1, off ; encoding: [0x00,0x80,0x48,0xdd,0x02,0x01,0xff,0x00]
|
||||
global_atomic_pk_add_bf16 v[2:3], a1, off
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: ds_pk_add_f16 v2, v1 ; encoding: [0x00,0x00,0x2e,0xd8,0x02,0x01,0x00,0x00]
|
||||
ds_pk_add_f16 v2, v1
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: ds_pk_add_f16 v2, a1 ; encoding: [0x00,0x00,0x2e,0xda,0x02,0x01,0x00,0x00]
|
||||
ds_pk_add_f16 v2, a1
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: ds_pk_add_rtn_f16 v3, v2, v1 ; encoding: [0x00,0x00,0x6e,0xd9,0x02,0x01,0x00,0x03]
|
||||
ds_pk_add_rtn_f16 v3, v2, v1
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: ds_pk_add_rtn_f16 a3, v2, a1 ; encoding: [0x00,0x00,0x6e,0xdb,0x02,0x01,0x00,0x03]
|
||||
ds_pk_add_rtn_f16 a3, v2, a1
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: ds_pk_add_bf16 v2, v1 ; encoding: [0x00,0x00,0x30,0xd8,0x02,0x01,0x00,0x00]
|
||||
ds_pk_add_bf16 v2, v1
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: ds_pk_add_bf16 v2, a1 ; encoding: [0x00,0x00,0x30,0xda,0x02,0x01,0x00,0x00]
|
||||
ds_pk_add_bf16 v2, a1
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: ds_pk_add_rtn_bf16 v3, v2, v1 ; encoding: [0x00,0x00,0x70,0xd9,0x02,0x01,0x00,0x03]
|
||||
ds_pk_add_rtn_bf16 v3, v2, v1
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: ds_pk_add_rtn_bf16 a3, v2, a1 ; encoding: [0x00,0x00,0x70,0xdb,0x02,0x01,0x00,0x03]
|
||||
ds_pk_add_rtn_bf16 a3, v2, a1
|
||||
|
||||
// NOT-GFX940: error: instruction not supported on this GPU
|
||||
// GFX940: v_mov_b64_e32 v[2:3], v[4:5] ; encoding: [0x04,0x71,0x04,0x7e]
|
||||
v_mov_b64 v[2:3], v[4:5]
|
||||
|
@ -68,3 +164,63 @@ v_fmamk_f32 v0, v2, 100.0, v3
|
|||
// GFX90A: error: instruction not supported on this GPU
|
||||
// GFX940: v_fmaak_f32 v0, v2, v3, 0x42c80000 ; encoding: [0x02,0x07,0x00,0x30,0x00,0x00,0xc8,0x42]
|
||||
v_fmaak_f32 v0, v2, v3, 100.0
|
||||
|
||||
// GFX90A: error: invalid operand for instruction
|
||||
// GFX10: error: instruction not supported on this GPU
|
||||
// GFX940: global_atomic_add_f32 v0, v[0:1], v2, off sc0 sc1 ; encoding: [0x00,0x80,0x35,0xdf,0x00,0x02,0x7f,0x00]
|
||||
global_atomic_add_f32 v0, v[0:1], v2, off sc0 sc1
|
||||
|
||||
// GFX90A: error: invalid operand for instruction
|
||||
// GFX10: error: instruction not supported on this GPU
|
||||
// GFX940: global_atomic_add_f32 v[0:1], v2, off sc1 ; encoding: [0x00,0x80,0x34,0xdf,0x00,0x02,0x7f,0x00]
|
||||
global_atomic_add_f32 v[0:1], v2, off sc1
|
||||
|
||||
// GFX90A: error: invalid operand for instruction
|
||||
// GFX10: error: instruction not supported on this GPU
|
||||
// GFX940: global_atomic_add_f32 v0, v2, s[0:1] sc1 ; encoding: [0x00,0x80,0x34,0xdf,0x00,0x02,0x00,0x00]
|
||||
global_atomic_add_f32 v0, v2, s[0:1] sc1
|
||||
|
||||
// GFX90A: error: invalid operand for instruction
|
||||
// GFX10: error: instruction not supported on this GPU
|
||||
// GFX940: global_atomic_add_f32 v1, v0, v2, s[0:1] sc0 sc1 ; encoding: [0x00,0x80,0x35,0xdf,0x00,0x02,0x00,0x01]
|
||||
global_atomic_add_f32 v1, v0, v2, s[0:1] sc0 sc1
|
||||
|
||||
// GFX90A: error: invalid operand for instruction
|
||||
// GFX10: error: instruction not supported on this GPU
|
||||
// GFX940: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 sc1 ; encoding: [0x00,0x80,0x39,0xdf,0x00,0x02,0x7f,0x00]
|
||||
global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 sc1
|
||||
|
||||
// GFX90A: error: invalid operand for instruction
|
||||
// GFX10: error: instruction not supported on this GPU
|
||||
// GFX940: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 ; encoding: [0x00,0x00,0x3d,0xdf,0x00,0x02,0x00,0x00]
|
||||
flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1
|
||||
|
||||
// GFX90A: error: invalid operand for instruction
|
||||
// GFX10: error: instruction not supported on this GPU
|
||||
// GFX940: flat_atomic_add_f64 v[0:1], v[2:3] sc1 ; encoding: [0x00,0x00,0x3c,0xdf,0x00,0x02,0x00,0x00]
|
||||
flat_atomic_add_f64 v[0:1], v[2:3] sc1
|
||||
|
||||
// GFX90A: error: invalid operand for instruction
|
||||
// GFX10: error: instruction not supported on this GPU
|
||||
// GFX940: flat_atomic_min_f64 v[0:1], v[2:3] sc1 ; encoding: [0x00,0x00,0x40,0xdf,0x00,0x02,0x00,0x00]
|
||||
flat_atomic_min_f64 v[0:1], v[2:3] sc1
|
||||
|
||||
// GFX90A: error: invalid operand for instruction
|
||||
// GFX10: error: instruction not supported on this GPU
|
||||
// GFX940: flat_atomic_max_f64 v[0:1], v[2:3] sc1 ; encoding: [0x00,0x00,0x44,0xdf,0x00,0x02,0x00,0x00]
|
||||
flat_atomic_max_f64 v[0:1], v[2:3] sc1
|
||||
|
||||
// GFX90A: error: invalid operand for instruction
|
||||
// GFX10: error: instruction not supported on this GPU
|
||||
// GFX940: global_atomic_add_f64 v[0:1], v[2:3], off sc1 ; encoding: [0x00,0x80,0x3c,0xdf,0x00,0x02,0x7f,0x00]
|
||||
global_atomic_add_f64 v[0:1], v[2:3], off sc1
|
||||
|
||||
// GFX90A: error: invalid operand for instruction
|
||||
// GFX10: error: instruction not supported on this GPU
|
||||
// GFX940: global_atomic_min_f64 v[0:1], v[2:3], off sc1 ; encoding: [0x00,0x80,0x40,0xdf,0x00,0x02,0x7f,0x00]
|
||||
global_atomic_min_f64 v[0:1], v[2:3], off sc1
|
||||
|
||||
// GFX90A: error: invalid operand for instruction
|
||||
// GFX10: error: instruction not supported on this GPU
|
||||
// GFX940: global_atomic_max_f64 v[0:1], v[2:3], off sc1 ; encoding: [0x00,0x80,0x44,0xdf,0x00,0x02,0x7f,0x00]
|
||||
global_atomic_max_f64 v[0:1], v[2:3], off sc1
|
||||
|
|
|
@ -15,6 +15,78 @@
|
|||
# GFX940: buffer_load_dword v5, off, s[8:11], s3 sc0 nt sc1 ; encoding: [0x00,0xc0,0x52,0xe0,0x00,0x05,0x02,0x03]
|
||||
0x00,0xc0,0x52,0xe0,0x00,0x05,0x02,0x03
|
||||
|
||||
# GFX940: flat_atomic_add_f32 v[2:3], v1 ; encoding: [0x00,0x00,0x34,0xdd,0x02,0x01,0x00,0x00]
|
||||
0x00,0x00,0x34,0xdd,0x02,0x01,0x00,0x00
|
||||
|
||||
# GFX940: flat_atomic_add_f32 v[2:3], a1 ; encoding: [0x00,0x00,0x34,0xdd,0x02,0x01,0x80,0x00]
|
||||
0x00,0x00,0x34,0xdd,0x02,0x01,0x80,0x00
|
||||
|
||||
# GFX940: flat_atomic_add_f32 v4, v[2:3], v1 sc0 ; encoding: [0x00,0x00,0x35,0xdd,0x02,0x01,0x00,0x04]
|
||||
0x00,0x00,0x35,0xdd,0x02,0x01,0x00,0x04
|
||||
|
||||
# GFX940: flat_atomic_add_f32 a4, v[2:3], a1 sc0 ; encoding: [0x00,0x00,0x35,0xdd,0x02,0x01,0x80,0x04]
|
||||
0x00,0x00,0x35,0xdd,0x02,0x01,0x80,0x04
|
||||
|
||||
# GFX940: flat_atomic_pk_add_f16 v4, v[2:3], v1 sc0 ; encoding: [0x00,0x00,0x39,0xdd,0x02,0x01,0x00,0x04]
|
||||
0x00,0x00,0x39,0xdd,0x02,0x01,0x00,0x04
|
||||
|
||||
# GFX940: flat_atomic_pk_add_f16 a4, v[2:3], a1 sc0 ; encoding: [0x00,0x00,0x39,0xdd,0x02,0x01,0x80,0x04]
|
||||
0x00,0x00,0x39,0xdd,0x02,0x01,0x80,0x04
|
||||
|
||||
# GFX940: flat_atomic_pk_add_f16 v[2:3], v1 ; encoding: [0x00,0x00,0x38,0xdd,0x02,0x01,0x00,0x00]
|
||||
0x00,0x00,0x38,0xdd,0x02,0x01,0x00,0x00
|
||||
|
||||
# GFX940: flat_atomic_pk_add_f16 v[2:3], a1 ; encoding: [0x00,0x00,0x38,0xdd,0x02,0x01,0x80,0x00]
|
||||
0x00,0x00,0x38,0xdd,0x02,0x01,0x80,0x00
|
||||
|
||||
# GFX940: flat_atomic_pk_add_bf16 v4, v[2:3], v1 sc0 ; encoding: [0x00,0x00,0x49,0xdd,0x02,0x01,0x00,0x04]
|
||||
0x00,0x00,0x49,0xdd,0x02,0x01,0x00,0x04
|
||||
|
||||
# GFX940: flat_atomic_pk_add_bf16 a4, v[2:3], a1 sc0 ; encoding: [0x00,0x00,0x49,0xdd,0x02,0x01,0x80,0x04]
|
||||
0x00,0x00,0x49,0xdd,0x02,0x01,0x80,0x04
|
||||
|
||||
# GFX940: flat_atomic_pk_add_bf16 v[2:3], v1 ; encoding: [0x00,0x00,0x48,0xdd,0x02,0x01,0x00,0x00]
|
||||
0x00,0x00,0x48,0xdd,0x02,0x01,0x00,0x00
|
||||
|
||||
# GFX940: flat_atomic_pk_add_bf16 v[2:3], a1 ; encoding: [0x00,0x00,0x48,0xdd,0x02,0x01,0x80,0x00]
|
||||
0x00,0x00,0x48,0xdd,0x02,0x01,0x80,0x00
|
||||
|
||||
# GFX940: global_atomic_pk_add_bf16 v4, v[2:3], v1, off sc0 ; encoding: [0x00,0x80,0x49,0xdd,0x02,0x01,0x7f,0x04]
|
||||
0x00,0x80,0x49,0xdd,0x02,0x01,0x7f,0x04
|
||||
|
||||
# GFX940: global_atomic_pk_add_bf16 a4, v[2:3], a1, off sc0 ; encoding: [0x00,0x80,0x49,0xdd,0x02,0x01,0xff,0x04]
|
||||
0x00,0x80,0x49,0xdd,0x02,0x01,0xff,0x04
|
||||
|
||||
# GFX940: global_atomic_pk_add_bf16 v[2:3], v1, off ; encoding: [0x00,0x80,0x48,0xdd,0x02,0x01,0x7f,0x00]
|
||||
0x00,0x80,0x48,0xdd,0x02,0x01,0x7f,0x00
|
||||
|
||||
# GFX940: global_atomic_pk_add_bf16 v[2:3], a1, off ; encoding: [0x00,0x80,0x48,0xdd,0x02,0x01,0xff,0x00]
|
||||
0x00,0x80,0x48,0xdd,0x02,0x01,0xff,0x00
|
||||
|
||||
# GFX940: ds_pk_add_f16 v2, v1 ; encoding: [0x00,0x00,0x2e,0xd8,0x02,0x01,0x00,0x00]
|
||||
0x00,0x00,0x2e,0xd8,0x02,0x01,0x00,0x00
|
||||
|
||||
# GFX940: ds_pk_add_f16 v2, a1 ; encoding: [0x00,0x00,0x2e,0xda,0x02,0x01,0x00,0x00]
|
||||
0x00,0x00,0x2e,0xda,0x02,0x01,0x00,0x00
|
||||
|
||||
# GFX940: ds_pk_add_rtn_f16 v3, v2, v1 ; encoding: [0x00,0x00,0x6e,0xd9,0x02,0x01,0x00,0x03]
|
||||
0x00,0x00,0x6e,0xd9,0x02,0x01,0x00,0x03
|
||||
|
||||
# GFX940: ds_pk_add_rtn_f16 a3, v2, a1 ; encoding: [0x00,0x00,0x6e,0xdb,0x02,0x01,0x00,0x03]
|
||||
0x00,0x00,0x6e,0xdb,0x02,0x01,0x00,0x03
|
||||
|
||||
# GFX940: ds_pk_add_bf16 v2, v1 ; encoding: [0x00,0x00,0x30,0xd8,0x02,0x01,0x00,0x00]
|
||||
0x00,0x00,0x30,0xd8,0x02,0x01,0x00,0x00
|
||||
|
||||
# GFX940: ds_pk_add_bf16 v2, a1 ; encoding: [0x00,0x00,0x30,0xda,0x02,0x01,0x00,0x00]
|
||||
0x00,0x00,0x30,0xda,0x02,0x01,0x00,0x00
|
||||
|
||||
# GFX940: ds_pk_add_rtn_bf16 v3, v2, v1 ; encoding: [0x00,0x00,0x70,0xd9,0x02,0x01,0x00,0x03]
|
||||
0x00,0x00,0x70,0xd9,0x02,0x01,0x00,0x03
|
||||
|
||||
# GFX940: ds_pk_add_rtn_bf16 a3, v2, a1 ; encoding: [0x00,0x00,0x70,0xdb,0x02,0x01,0x00,0x03]
|
||||
0x00,0x00,0x70,0xdb,0x02,0x01,0x00,0x03
|
||||
|
||||
# GFX940: v_mov_b64_e32 v[2:3], v[4:5] ; encoding: [0x04,0x71,0x04,0x7e]
|
||||
0x04,0x71,0x04,0x7e
|
||||
|
||||
|
@ -41,3 +113,39 @@
|
|||
|
||||
# GFX940: v_fmaak_f32 v0, v2, v3, 0x42c80000 ; encoding: [0x02,0x07,0x00,0x30,0x00,0x00,0xc8,0x42]
|
||||
0x02,0x07,0x00,0x30,0x00,0x00,0xc8,0x42
|
||||
|
||||
# GFX940: global_atomic_add_f32 v0, v[0:1], v2, off sc0 sc1 ; encoding: [0x00,0x80,0x35,0xdf,0x00,0x02,0x7f,0x00]
|
||||
0x00,0x80,0x35,0xdf,0x00,0x02,0x7f,0x00
|
||||
|
||||
# GFX940: global_atomic_add_f32 v[0:1], v2, off sc1 ; encoding: [0x00,0x80,0x34,0xdf,0x00,0x02,0x7f,0x00]
|
||||
0x00,0x80,0x34,0xdf,0x00,0x02,0x7f,0x00
|
||||
|
||||
# GFX940: global_atomic_add_f32 v0, v2, s[0:1] sc1 ; encoding: [0x00,0x80,0x34,0xdf,0x00,0x02,0x00,0x00]
|
||||
0x00,0x80,0x34,0xdf,0x00,0x02,0x00,0x00
|
||||
|
||||
# GFX940: global_atomic_add_f32 v1, v0, v2, s[0:1] sc0 sc1 ; encoding: [0x00,0x80,0x35,0xdf,0x00,0x02,0x00,0x01]
|
||||
0x00,0x80,0x35,0xdf,0x00,0x02,0x00,0x01
|
||||
|
||||
# GFX940: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 sc1 ; encoding: [0x00,0x80,0x39,0xdf,0x00,0x02,0x7f,0x00]
|
||||
0x00,0x80,0x39,0xdf,0x00,0x02,0x7f,0x00
|
||||
|
||||
# GFX940: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 ; encoding: [0x00,0x00,0x3d,0xdf,0x00,0x02,0x00,0x00]
|
||||
0x00,0x00,0x3d,0xdf,0x00,0x02,0x00,0x00
|
||||
|
||||
# GFX940: flat_atomic_add_f64 v[0:1], v[2:3] sc1 ; encoding: [0x00,0x00,0x3c,0xdf,0x00,0x02,0x00,0x00]
|
||||
0x00,0x00,0x3c,0xdf,0x00,0x02,0x00,0x00
|
||||
|
||||
# GFX940: flat_atomic_min_f64 v[0:1], v[2:3] sc1 ; encoding: [0x00,0x00,0x40,0xdf,0x00,0x02,0x00,0x00]
|
||||
0x00,0x00,0x40,0xdf,0x00,0x02,0x00,0x00
|
||||
|
||||
# GFX940: flat_atomic_max_f64 v[0:1], v[2:3] sc1 ; encoding: [0x00,0x00,0x44,0xdf,0x00,0x02,0x00,0x00]
|
||||
0x00,0x00,0x44,0xdf,0x00,0x02,0x00,0x00
|
||||
|
||||
# GFX940: global_atomic_add_f64 v[0:1], v[2:3], off sc1 ; encoding: [0x00,0x80,0x3c,0xdf,0x00,0x02,0x7f,0x00]
|
||||
0x00,0x80,0x3c,0xdf,0x00,0x02,0x7f,0x00
|
||||
|
||||
# GFX940: global_atomic_min_f64 v[0:1], v[2:3], off sc1 ; encoding: [0x00,0x80,0x40,0xdf,0x00,0x02,0x7f,0x00]
|
||||
0x00,0x80,0x40,0xdf,0x00,0x02,0x7f,0x00
|
||||
|
||||
# GFX940: global_atomic_max_f64 v[0:1], v[2:3], off sc1 ; encoding: [0x00,0x80,0x44,0xdf,0x00,0x02,0x7f,0x00]
|
||||
0x00,0x80,0x44,0xdf,0x00,0x02,0x7f,0x00
|
||||
|
|