; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefixes=GCN,MUBUF %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s ; Test that the VGPR spiller correctly switches to SGPR offsets when the ; instruction offset field would overflow, and that it accounts for memory ; swizzling. ; GCN-LABEL: test_inst_offset_kernel define amdgpu_kernel void @test_inst_offset_kernel() { entry: ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in ; the instruction offset field. %alloca = alloca i8, i32 4088, align 4, addrspace(5) %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} ; 4-byte Folded Spill %a = load volatile i32, i32 addrspace(5)* %aptr ; Force %a to spill. call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 store volatile i32 %a, i32 addrspace(5)* %outptr ret void } ; GCN-LABEL: test_sgpr_offset_kernel define amdgpu_kernel void @test_sgpr_offset_kernel() { entry: ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not ; fit in the instruction, and has to live in the SGPR offset. %alloca = alloca i8, i32 4092, align 4, addrspace(5) %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 ; 0x40000 / 64 = 4096 (for wave64) ; MUBUF: s_mov_b32 s4, 0x40000 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill ; FLATSCR: s_movk_i32 s2, 0x1000 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s2 ; 4-byte Folded Spill %a = load volatile i32, i32 addrspace(5)* %aptr ; Force %a to spill call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 store volatile i32 %a, i32 addrspace(5)* %outptr ret void } ; FIXME: If we fail to scavenge an SGPR in a kernel we don't have a stack ; pointer to temporarily update, so we just crash. ; GCN-LABEL: test_sgpr_offset_function_scavenge_fail_func define void @test_sgpr_offset_function_scavenge_fail_func() #2 { entry: ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not ; fit in the instruction, and has to live in the SGPR offset. %alloca = alloca i8, i32 4096, align 4, addrspace(5) %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0 %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1 %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2 %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3 %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4 %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5 %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6 %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7 ; 0x40000 / 64 = 4096 (for wave64) %a = load volatile i32, i32 addrspace(5)* %aptr ; MUBUF: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1004 ; MUBUF-NEXT: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+:[0-9]+}}], s32 offen ; 4-byte Folded Spill ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1004 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a) %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0 %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1 %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2 %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3 %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4 %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5 %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6 %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 ; MUBUF: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1004 ; MUBUF-NEXT: buffer_load_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+:[0-9]+}}], s32 offen ; 4-byte Folded Reload ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1004 ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload ; Force %a to spill with no free SGPRs call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a) ret void } define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 { entry: ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not ; fit in the instruction, and has to live in the SGPR offset. %alloca = alloca i8, i32 4096, align 4, addrspace(5) %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0 %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1 %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2 %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3 %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4 %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5 %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6 %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7 ; 0x40000 / 64 = 4096 (for wave64) %a = load volatile i32, i32 addrspace(5)* %aptr ; MUBUF: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1004 ; MUBUF: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+:[0-9]+}}], 0 offen ; 4-byte Folded Spill ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0x1004 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a) %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0 %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1 %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2 %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3 %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4 %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5 %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6 %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 ; MUBUF: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1004 ; MUBUF: buffer_load_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+:[0-9]+}}], 0 offen ; 4-byte Folded Reload ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0x1004 ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload ; Force %a to spill with no free SGPRs call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a) ret void } ; GCN-LABEL: test_sgpr_offset_subregs_kernel define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { entry: ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in ; the instruction offset field. %alloca = alloca i8, i32 4084, align 4, addrspace(5) %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xff8 ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], [[SOFF]] ; 8-byte Folded Spill %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr ; Force %a to spill. call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () ; Ensure the alloca sticks around. %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1 %b = load volatile i32, i32 addrspace(5)* %bptr ; Ensure the spill is of the full super-reg. call void asm sideeffect "; $0", "r"(<2 x i32> %a) ret void } ; GCN-LABEL: test_inst_offset_subregs_kernel define amdgpu_kernel void @test_inst_offset_subregs_kernel() { entry: ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live ; in the SGPR offset. %alloca = alloca i8, i32 4088, align 4, addrspace(5) %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* ; 0x3ff00 / 64 = 4092 (for wave64) ; MUBUF: s_mov_b32 s4, 0x3ff00 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xffc ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], [[SOFF]] ; 8-byte Folded Spill %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr ; Force %a to spill. call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () ; Ensure the alloca sticks around. %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1 %b = load volatile i32, i32 addrspace(5)* %bptr ; Ensure the spill is of the full super-reg. call void asm sideeffect "; $0", "r"(<2 x i32> %a) ret void } ; GCN-LABEL: test_inst_offset_function define void @test_inst_offset_function() { entry: ; Occupy enough bytes of scratch, so the offset of the spill of %a ; just fits in the instruction offset field when the emergency stack ; slot is added. It's hard to hit the actual limit since we're also ; going to insert the emergency stack slot for large frames. %alloca = alloca i8, i32 4088, align 4, addrspace(5) %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill %a = load volatile i32, i32 addrspace(5)* %aptr ; Force %a to spill. call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 store volatile i32 %a, i32 addrspace(5)* %outptr ret void } ; GCN-LABEL: test_sgpr_offset_function define void @test_sgpr_offset_function() { entry: ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not ; fit in the instruction, and has to live in the SGPR offset. %alloca = alloca i8, i32 4096, align 4, addrspace(5) %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 ; 0x40000 / 64 = 4096 (for wave64) ; MUBUF: s_add_i32 s4, s32, 0x40100 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill ; FLATSCR: s_add_i32 s0, s32, 0x1004 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s0 ; 4-byte Folded Spill %a = load volatile i32, i32 addrspace(5)* %aptr ; Force %a to spill call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 store volatile i32 %a, i32 addrspace(5)* %outptr ret void } ; GCN-LABEL: test_sgpr_offset_subregs_function define void @test_sgpr_offset_subregs_function() { entry: ; We want to test the spill of the last subreg of %a is the highest ; valid value for the immediate offset. We enable the emergency ; stack slot for large frames, so it's hard to get the frame layout ; exactly as we want to test it. ; ; Occupy 4084 bytes of scratch, so that the spill of the last subreg of %a ; still fits below offset 4096 (4084 + 8 - 4 = 4092), and can be placed in ; the instruction offset field. %alloca = alloca i8, i32 4084, align 4, addrspace(5) %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4084 ; 4-byte Folded Spill ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], s32 offset:4084 ; 8-byte Folded Spill %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr ; Force %a to spill. call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () ; Ensure the alloca sticks around. %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1 %b = load volatile i32, i32 addrspace(5)* %bptr ; Ensure the spill is of the full super-reg. call void asm sideeffect "; $0", "r"(<2 x i32> %a) ret void } ; GCN-LABEL: test_inst_offset_subregs_function define void @test_inst_offset_subregs_function() { entry: ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a ; does not fit below offset 4096 (408 + 4 + 8 - 4 = 4096), and has to live ; in the SGPR offset. %alloca = alloca i8, i32 4088, align 4, addrspace(5) %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* ; 0x3ff0000 / 64 = 4092 (for wave64) ; MUBUF: s_add_i32 s4, s32, 0x3ff00 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], s32 offset:4092 ; 8-byte Folded Spill %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr ; Force %a to spill. call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () ; Ensure the alloca sticks around. %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1 %b = load volatile i32, i32 addrspace(5)* %bptr ; Ensure the spill is of the full super-reg. call void asm sideeffect "; $0", "r"(<2 x i32> %a) ret void } attributes #0 = { nounwind } attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" } attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" } attributes #3 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" }