Compiler projects using llvm
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=greedy,1 -verify-machineinstrs < %s | FileCheck -check-prefix=REGALLOC-GFX908 %s
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=prologepilog -verify-machineinstrs < %s | FileCheck -check-prefix=PEI-GFX908 %s
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=greedy,1 -verify-machineinstrs < %s | FileCheck -check-prefix=REGALLOC-GFX90A %s
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=prologepilog -verify-machineinstrs < %s | FileCheck -check-prefix=PEI-GFX90A %s

; Partial reg copy and spill missed during regalloc handled later at frame lowering.
define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
  ; REGALLOC-GFX908-LABEL: name: partial_copy
  ; REGALLOC-GFX908: bb.0 (%ir-block.0):
  ; REGALLOC-GFX908:  INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2949130 /* regdef:VReg_64 */, def [[VREG_64:%[0-9]+]]
  ; REGALLOC-GFX908:  SI_SPILL_V64_SAVE [[VREG_64]], %stack.0
  ; REGALLOC-GFX908:  [[V_MFMA_I32_4X4X4I8_A128:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64
  ; REGALLOC-GFX908:  [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0
  ; REGALLOC-GFX908:  GLOBAL_STORE_DWORDX2 undef %{{[0-9]+}}:vreg_64, [[SI_SPILL_V64_RESTORE]]
  ; REGALLOC-GFX908:  [[COPY_A128_TO_V128:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_A128]]
  ; REGALLOC-GFX908:  GLOBAL_STORE_DWORDX4 undef %{{[0-9]+}}:vreg_64, [[COPY_A128_TO_V128]]
  ;
  ; PEI-GFX908-LABEL: name: partial_copy
  ; PEI-GFX908: bb.0 (%ir-block.0):
  ; PEI-GFX908:  INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2949130 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
  ; PEI-GFX908:  BUFFER_STORE_DWORD_OFFSET killed $vgpr0
  ; PEI-GFX908:  $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1
  ; PEI-GFX908:  renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64
  ; PEI-GFX908:  $vgpr0 = BUFFER_LOAD_DWORD_OFFSET
  ; PEI-GFX908:  $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4
  ; PEI-GFX908:  GLOBAL_STORE_DWORDX2 undef renamable ${{.*}}, killed renamable $vgpr0_vgpr1
  ; PEI-GFX908:  renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
  ; PEI-GFX908:  GLOBAL_STORE_DWORDX4 undef renamable ${{.*}}, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3
  ;
  ; REGALLOC-GFX90A-LABEL: name: partial_copy
  ; REGALLOC-GFX90A: bb.0 (%ir-block.0):
  ; REGALLOC-GFX90A:  INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64_Align2 */, def [[VREG_64:%[0-9]+]]
  ; REGALLOC-GFX90A:  SI_SPILL_V64_SAVE [[VREG_64]], %stack.0
  ; REGALLOC-GFX90A:  [[V_MFMA_I32_4X4X4I8_A128:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64
  ; REGALLOC-GFX90A:  [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64_align2 = SI_SPILL_AV64_RESTORE %stack.0
  ; REGALLOC-GFX90A:  GLOBAL_STORE_DWORDX2 undef %{{[0-9]+}}:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]]
  ; REGALLOC-GFX90A:  GLOBAL_STORE_DWORDX4 undef %{{[0-9]+}}:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_A128]]
  ;
  ; PEI-GFX90A-LABEL: name: partial_copy
  ; PEI-GFX90A: bb.0 (%ir-block.0):
  ; PEI-GFX90A:  INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
  ; PEI-GFX90A:  BUFFER_STORE_DWORD_OFFSET killed $vgpr0
  ; PEI-GFX90A:  $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1
  ; PEI-GFX90A:  renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64
  ; PEI-GFX90A:  $vgpr0 = BUFFER_LOAD_DWORD_OFFSET
  ; PEI-GFX90A:  $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4
  ; PEI-GFX90A:  GLOBAL_STORE_DWORDX2 undef renamable ${{.*}}, killed renamable $vgpr0_vgpr1
  ; PEI-GFX90A:  GLOBAL_STORE_DWORDX4 undef renamable ${{.*}}, killed renamable $agpr0_agpr1_agpr2_agpr3
  call void asm sideeffect "; use $0", "a" (i32 undef)
  %v0 = call <4 x i32> asm sideeffect "; def $0", "=v" ()
  %v1 = call <2 x i32> asm sideeffect "; def $0", "=v" ()
  %mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %arg, i32 0, i32 0, i32 0)
  store volatile <4 x i32> %v0, <4 x i32> addrspace(1)* undef
  store volatile <2 x i32> %v1, <2 x i32> addrspace(1)* undef
  store volatile <4 x i32> %mai, <4 x i32> addrspace(1)* undef
  ret void
}

declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)

attributes #0 = { nounwind "amdgpu-num-vgpr"="5" }