# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s
# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s

# Tests for hoisting "s_waitcnt vmcnt" out of loops by si-insert-waitcnts.
# The check patterns "S_WAITCNT 39" (gfx900) and "S_WAITCNT 16" (gfx1010) are
# prefix matches on the printed immediate; e.g. the vmcnt(0) waits checked in
# waitcnt_vm_zero are printed as 3952 and 16240 respectively.

---

# The loop contains a store and a use of a value loaded outside of the loop.
# We expect the waitcnt for the use to be hoisted on GFX9, but not on GFX10+
# because we have the vscnt counter.

# GFX9-LABEL: waitcnt_vm_loop
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name:            waitcnt_vm_loop
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# Same as before, but the loop preheader has no terminator.
# GFX9-LABEL: waitcnt_vm_loop_noterm
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_noterm
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name:            waitcnt_vm_loop_noterm
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec

  bb.1:
    successors: %bb.1, %bb.2

    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# Same as before but there is a preexisting waitcnt in the preheader.
# We expect no additional waitcnt after the existing one in the preheader, and
# none in the loop. Only the gfx900 output is checked for this function.

# GFX9-LABEL: waitcnt_vm_loop_noterm_wait
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
name:            waitcnt_vm_loop_noterm_wait
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    S_WAITCNT 3952

  bb.1:
    successors: %bb.1, %bb.2

    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# The loop contains a store, a load, and uses values loaded both inside and
# outside the loop.
# We do not expect the waitcnt to be hoisted out of the loop.
# GFX9-LABEL: waitcnt_vm_loop_load
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_load
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name:            waitcnt_vm_loop_load
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    $vgpr7 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr7, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# The loop contains a use of a value loaded outside of the loop, and no store
# nor load.
# We do not expect the waitcnt to be hoisted out of the loop.

# GFX9-LABEL: waitcnt_vm_loop_no_store
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_no_store
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name:            waitcnt_vm_loop_no_store
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# The loop contains a store, no load, and doesn't use any value loaded inside
# or outside of the loop. There is only one use of the loaded value in the
# exit block.
# We don't expect any s_waitcnt vmcnt in the loop body or preheader, but expect
# one in the exit block (checked after the bb.2 label, before the use of the
# loaded $vgpr0).

# GFX9-LABEL: waitcnt_vm_loop_no_use
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX9: S_WAITCNT 39

# GFX10-LABEL: waitcnt_vm_loop_no_use
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:
# GFX10: S_WAITCNT 16
name:            waitcnt_vm_loop_no_use
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    S_ENDPGM 0

...
---

# The loop loads a value that is not used in the loop, and uses a value loaded
# outside of the loop.
# We expect the waitcnt to be hoisted out of the loop to wait a single time
# before the loop is executed and avoid waiting for the load to complete on
# each iteration.
# GFX9-LABEL: waitcnt_vm_loop2
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop2
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name:            waitcnt_vm_loop2
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# Same as before with an additional store in the loop. We still expect the
# waitcnt instructions to be hoisted.

# GFX9-LABEL: waitcnt_vm_loop2_store
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop2_store
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name:            waitcnt_vm_loop2_store
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# Same as loop2 but the value loaded inside the loop is also used in the loop.
# We do not expect the waitcnt to be hoisted out of the loop.
# GFX9-LABEL: waitcnt_vm_loop2_use_in_loop
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop2_use_in_loop
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name:            waitcnt_vm_loop2_use_in_loop
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    $vgpr4 = V_ADD_U32_e32 $vgpr5, $vgpr1, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# The loop contains a use of a value loaded outside of the loop, but we already
# waited for that load to complete. The loop also loads a value that is not used
# in the loop. We do not expect any waitcnt in the loop.
# GFX9-LABEL: waitcnt_vm_loop2_nowait
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.3:

# GFX10-LABEL: waitcnt_vm_loop2_nowait
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.3:
name:            waitcnt_vm_loop2_nowait
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.2

    $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
    $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
    $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
    S_BRANCH %bb.2

  bb.2:
    successors: %bb.2, %bb.3

    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.2, implicit killed $scc
    S_BRANCH %bb.3

  bb.3:
    S_ENDPGM 0

...
---

# Similar test case but for register intervals: the loop uses one register of
# a tuple loaded outside of it and loads a tuple that is not used in the loop.
# We expect the waitcnt to be hoisted.
# GFX9-LABEL: waitcnt_vm_loop2_reginterval
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop2_reginterval
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name:            waitcnt_vm_loop2_reginterval
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    $vgpr10 = COPY $vgpr0
    $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# Similar test case but for register intervals, where one register ($vgpr7) of
# the tuple loaded inside the loop is also used in the loop.
# We do not expect the waitcnt to be hoisted out of the loop.

# GFX9-LABEL: waitcnt_vm_loop2_reginterval2
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop2_reginterval2
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name:            waitcnt_vm_loop2_reginterval2
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    $vgpr10 = COPY $vgpr0
    $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
    $vgpr11 = COPY $vgpr7
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
---

# The loop loads a value that is not used in the loop, but uses a value loaded
# outside of it.
# We expect the s_waitcnt instruction to be hoisted.
# A s_waitcnt vmcnt(0) is generated to flush in the preheader, but for this
# specific test case, it would be better to use vmcnt(1) instead. This is
# currently not implemented.
# The preheader checks match the full vmcnt(0) immediate; the loop checks use
# the shorter prefix so that no vmcnt wait of any count is accepted there.

# GFX9-LABEL: waitcnt_vm_zero
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 3952
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_zero
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16240
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:
name:            waitcnt_vm_zero
body:             |
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr3, implicit $exec
    $vgpr2 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr3, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...