[AMDGPU] Add GFX11 test coverage for the memory legalizer

This commit is contained in:
Jay Foad 2022-06-09 15:00:49 +01:00
parent a33983729d
commit a3fc8adb7e
24 changed files with 38108 additions and 0 deletions

View File

@ -8,6 +8,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @singlethread_acquire_fence() {
; GFX6-LABEL: singlethread_acquire_fence:
@ -45,6 +47,14 @@ define amdgpu_kernel void @singlethread_acquire_fence() {
; GFX940-TGSPLIT-LABEL: singlethread_acquire_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: singlethread_acquire_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: singlethread_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread") acquire
ret void
@ -86,6 +96,14 @@ define amdgpu_kernel void @singlethread_release_fence() {
; GFX940-TGSPLIT-LABEL: singlethread_release_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: singlethread_release_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: singlethread_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread") release
ret void
@ -127,6 +145,14 @@ define amdgpu_kernel void @singlethread_acq_rel_fence() {
; GFX940-TGSPLIT-LABEL: singlethread_acq_rel_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: singlethread_acq_rel_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: singlethread_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread") acq_rel
ret void
@ -168,6 +194,14 @@ define amdgpu_kernel void @singlethread_seq_cst_fence() {
; GFX940-TGSPLIT-LABEL: singlethread_seq_cst_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: singlethread_seq_cst_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: singlethread_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread") seq_cst
ret void
@ -209,6 +243,14 @@ define amdgpu_kernel void @singlethread_one_as_acquire_fence() {
; GFX940-TGSPLIT-LABEL: singlethread_one_as_acquire_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: singlethread_one_as_acquire_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: singlethread_one_as_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") acquire
ret void
@ -250,6 +292,14 @@ define amdgpu_kernel void @singlethread_one_as_release_fence() {
; GFX940-TGSPLIT-LABEL: singlethread_one_as_release_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: singlethread_one_as_release_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: singlethread_one_as_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") release
ret void
@ -291,6 +341,14 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() {
; GFX940-TGSPLIT-LABEL: singlethread_one_as_acq_rel_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: singlethread_one_as_acq_rel_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: singlethread_one_as_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") acq_rel
ret void
@ -332,6 +390,14 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() {
; GFX940-TGSPLIT-LABEL: singlethread_one_as_seq_cst_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: singlethread_one_as_seq_cst_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: singlethread_one_as_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") seq_cst
ret void
@ -373,6 +439,14 @@ define amdgpu_kernel void @wavefront_acquire_fence() {
; GFX940-TGSPLIT-LABEL: wavefront_acquire_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: wavefront_acquire_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: wavefront_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront") acquire
ret void
@ -414,6 +488,14 @@ define amdgpu_kernel void @wavefront_release_fence() {
; GFX940-TGSPLIT-LABEL: wavefront_release_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: wavefront_release_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: wavefront_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront") release
ret void
@ -455,6 +537,14 @@ define amdgpu_kernel void @wavefront_acq_rel_fence() {
; GFX940-TGSPLIT-LABEL: wavefront_acq_rel_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: wavefront_acq_rel_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: wavefront_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront") acq_rel
ret void
@ -496,6 +586,14 @@ define amdgpu_kernel void @wavefront_seq_cst_fence() {
; GFX940-TGSPLIT-LABEL: wavefront_seq_cst_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: wavefront_seq_cst_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: wavefront_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront") seq_cst
ret void
@ -537,6 +635,14 @@ define amdgpu_kernel void @wavefront_one_as_acquire_fence() {
; GFX940-TGSPLIT-LABEL: wavefront_one_as_acquire_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: wavefront_one_as_acquire_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: wavefront_one_as_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") acquire
ret void
@ -578,6 +684,14 @@ define amdgpu_kernel void @wavefront_one_as_release_fence() {
; GFX940-TGSPLIT-LABEL: wavefront_one_as_release_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: wavefront_one_as_release_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: wavefront_one_as_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") release
ret void
@ -619,6 +733,14 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() {
; GFX940-TGSPLIT-LABEL: wavefront_one_as_acq_rel_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: wavefront_one_as_acq_rel_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: wavefront_one_as_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") acq_rel
ret void
@ -660,6 +782,14 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() {
; GFX940-TGSPLIT-LABEL: wavefront_one_as_seq_cst_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: wavefront_one_as_seq_cst_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: wavefront_one_as_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") seq_cst
ret void
@ -714,6 +844,18 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: workgroup_acquire_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: workgroup_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acquire
ret void
@ -765,6 +907,17 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: workgroup_release_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: workgroup_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") release
ret void
@ -819,6 +972,18 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: workgroup_acq_rel_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: workgroup_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acq_rel
ret void
@ -873,6 +1038,18 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: workgroup_seq_cst_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: workgroup_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") seq_cst
ret void
@ -921,6 +1098,17 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: workgroup_one_as_acquire_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: workgroup_one_as_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acquire
ret void
@ -966,6 +1154,16 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: workgroup_one_as_release_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: workgroup_one_as_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") release
ret void
@ -1014,6 +1212,17 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: workgroup_one_as_acq_rel_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acq_rel
ret void
@ -1062,6 +1271,17 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: workgroup_one_as_seq_cst_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") seq_cst
ret void
@ -1126,6 +1346,22 @@ define amdgpu_kernel void @agent_acquire_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: agent_acquire_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: agent_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") acquire
ret void
@ -1180,6 +1416,18 @@ define amdgpu_kernel void @agent_release_fence() {
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: agent_release_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: agent_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") release
ret void
@ -1244,6 +1492,22 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: agent_acq_rel_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: agent_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") acq_rel
ret void
@ -1308,6 +1572,22 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: agent_seq_cst_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: agent_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") seq_cst
ret void
@ -1372,6 +1652,22 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: agent_one_as_acquire_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: agent_one_as_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acquire
ret void
@ -1426,6 +1722,18 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: agent_one_as_release_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: agent_one_as_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") release
ret void
@ -1490,6 +1798,22 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: agent_one_as_acq_rel_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: agent_one_as_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acq_rel
ret void
@ -1554,6 +1878,22 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: agent_one_as_seq_cst_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: agent_one_as_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") seq_cst
ret void
@ -1622,6 +1962,22 @@ define amdgpu_kernel void @system_acquire_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: system_acquire_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: system_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence acquire
ret void
@ -1678,6 +2034,18 @@ define amdgpu_kernel void @system_release_fence() {
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: system_release_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: system_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
entry:
fence release
ret void
@ -1746,6 +2114,22 @@ define amdgpu_kernel void @system_acq_rel_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: system_acq_rel_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: system_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence acq_rel
ret void
@ -1814,6 +2198,22 @@ define amdgpu_kernel void @system_seq_cst_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: system_seq_cst_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: system_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence seq_cst
ret void
@ -1882,6 +2282,22 @@ define amdgpu_kernel void @system_one_as_acquire_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: system_one_as_acquire_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: system_one_as_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") acquire
ret void
@ -1938,6 +2354,18 @@ define amdgpu_kernel void @system_one_as_release_fence() {
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: system_one_as_release_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: system_one_as_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") release
ret void
@ -2006,6 +2434,22 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: system_one_as_acq_rel_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: system_one_as_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") acq_rel
ret void
@ -2074,6 +2518,22 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: system_one_as_seq_cst_fence:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: system_one_as_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") seq_cst
ret void

File diff suppressed because it is too large Load Diff

View File

@ -7,6 +7,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
@ -112,6 +114,32 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_nontemporal_load_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_load_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load i32, i32* %in, align 4, !nontemporal !0
@ -235,6 +263,34 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_nontemporal_load_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_load_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -348,6 +404,32 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_nontemporal_store_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_store_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load i32, i32* %in, align 4
@ -471,6 +553,34 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_nontemporal_store_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_store_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -3,6 +3,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
@ -60,6 +62,34 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_nontemporal_load_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_load_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load volatile i32, i32* %in, align 4
@ -129,6 +159,36 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_nontemporal_load_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_load_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -194,6 +254,34 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_nontemporal_store_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_store_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load i32, i32* %in, align 4
@ -263,6 +351,36 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_nontemporal_store_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_store_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -328,6 +446,34 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_volatile_workgroup_acquire_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_volatile_workgroup_acquire_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic volatile i32, i32* %in syncscope("workgroup") acquire, align 4
@ -386,6 +532,33 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_volatile_workgroup_release_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_volatile_workgroup_release_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic volatile i32 %in, i32* %out syncscope("workgroup") release, align 4

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -8,6 +8,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @global_nontemporal_load_0(
; GFX6-LABEL: global_nontemporal_load_0:
@ -115,6 +117,28 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_nontemporal_load_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_load_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0
@ -237,6 +261,28 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_nontemporal_load_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] slc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_load_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] slc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -352,6 +398,28 @@ define amdgpu_kernel void @global_nontemporal_store_0(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] nt
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_nontemporal_store_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_store_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
@ -469,6 +537,28 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] nt
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_nontemporal_store_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_store_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()

File diff suppressed because it is too large Load Diff

View File

@ -4,6 +4,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @global_volatile_load_0(
; GFX6-LABEL: global_volatile_load_0:
@ -68,6 +70,26 @@ define amdgpu_kernel void @global_volatile_load_0(
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_volatile_load_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_volatile_load_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load volatile i32, i32 addrspace(1)* %in, align 4
@ -146,6 +168,28 @@ define amdgpu_kernel void @global_volatile_load_1(
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_volatile_load_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_volatile_load_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -222,6 +266,30 @@ define amdgpu_kernel void @global_volatile_store_0(
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_volatile_store_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_volatile_store_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
@ -300,6 +368,30 @@ define amdgpu_kernel void @global_volatile_store_1(
; SKIP-CACHE-INV-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_volatile_store_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_volatile_store_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -373,6 +465,27 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_volatile_workgroup_acquire_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_volatile_workgroup_acquire_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic volatile i32, i32 addrspace(1)* %in syncscope("workgroup") acquire, align 4
@ -441,6 +554,31 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_volatile_workgroup_release_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_volatile_workgroup_release_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic volatile i32 %in, i32 addrspace(1)* %out syncscope("workgroup") release, align 4

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -8,6 +8,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @local_nontemporal_load_0(
; GFX6-LABEL: local_nontemporal_load_0:
@ -125,6 +127,32 @@ define amdgpu_kernel void @local_nontemporal_load_0(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: local_nontemporal_load_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ds_load_b32 v0, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_nontemporal_load_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ds_load_b32 v0, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0
@ -251,6 +279,32 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: local_nontemporal_load_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX11-WGP-NEXT: ds_load_b32 v0, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_nontemporal_load_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX11-CU-NEXT: ds_load_b32 v0, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -373,6 +427,32 @@ define amdgpu_kernel void @local_nontemporal_store_0(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: local_nontemporal_store_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: ds_store_b32 v0, v1
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_nontemporal_store_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
@ -496,6 +576,32 @@ define amdgpu_kernel void @local_nontemporal_store_1(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: local_nontemporal_store_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: ds_store_b32 v0, v1
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_nontemporal_store_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -4,6 +4,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @local_volatile_load_0(
; GFX6-LABEL: local_volatile_load_0:
@ -73,6 +75,32 @@ define amdgpu_kernel void @local_volatile_load_0(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: local_volatile_load_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ds_load_b32 v0, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_volatile_load_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ds_load_b32 v0, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
%val = load volatile i32, i32 addrspace(3)* %in, align 4
@ -151,6 +179,32 @@ define amdgpu_kernel void @local_volatile_load_1(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: local_volatile_load_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX11-WGP-NEXT: ds_load_b32 v0, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_volatile_load_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX11-CU-NEXT: ds_load_b32 v0, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -225,6 +279,32 @@ define amdgpu_kernel void @local_volatile_store_0(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: local_volatile_store_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: ds_store_b32 v0, v1
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_volatile_store_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
@ -300,6 +380,32 @@ define amdgpu_kernel void @local_volatile_store_1(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: local_volatile_store_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: ds_store_b32 v0, v1
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_volatile_store_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -369,6 +475,29 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: local_volatile_workgroup_acquire_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: ds_load_b32 v0, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: ds_store_b32 v1, v0
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_volatile_workgroup_acquire_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: ds_load_b32 v0, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: ds_store_b32 v1, v0
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic volatile i32, i32 addrspace(3)* %in syncscope("workgroup") acquire, align 4
@ -431,6 +560,27 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: local_volatile_workgroup_release_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: ds_store_b32 v0, v1
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_volatile_workgroup_release_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic volatile i32 %in, i32 addrspace(3)* %out syncscope("workgroup") release, align 4

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -8,6 +8,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @private_nontemporal_load_0(
; GFX6-LABEL: private_nontemporal_load_0:
@ -149,6 +151,30 @@ define amdgpu_kernel void @private_nontemporal_load_0(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: private_nontemporal_load_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 slc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_load_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 slc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0
@ -301,6 +327,30 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: private_nontemporal_load_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 slc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_load_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 slc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -448,6 +498,30 @@ define amdgpu_kernel void @private_nontemporal_store_0(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX940-TGSPLIT-NEXT: scratch_store_dword off, v0, s4 nt
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: private_nontemporal_store_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 glc slc
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_store_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 glc slc
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
@ -598,6 +672,32 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: scratch_store_dword v0, v1, s4 nt
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: private_nontemporal_store_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 glc slc
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_store_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 glc slc
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()

View File

@ -4,6 +4,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @private_volatile_load_0(
; GFX6-LABEL: private_volatile_load_0:
@ -93,6 +95,30 @@ define amdgpu_kernel void @private_volatile_load_0(
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: private_volatile_load_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_volatile_load_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
%val = load volatile i32, i32 addrspace(5)* %in, align 4
@ -191,6 +217,30 @@ define amdgpu_kernel void @private_volatile_load_1(
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: private_volatile_load_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_volatile_load_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -291,6 +341,32 @@ define amdgpu_kernel void @private_volatile_store_0(
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: private_volatile_store_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_volatile_store_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
@ -392,6 +468,34 @@ define amdgpu_kernel void @private_volatile_store_1(
; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: private_volatile_store_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_volatile_store_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()