; Compiler projects using llvm
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s

; External function called by the kernels in this file. Under the Itanium C++
; mangling scheme the symbol demangles to get_global_id(unsigned int).
; NOTE(review): attribute group #0 is not defined in the portion of the file
; reviewed here - presumably it appears after the last function; confirm.
declare i64 @_Z13get_global_idj(i32) #0

; Kernel under test: sums eight i64 values read from %buffer and stores the
; result back into %buffer. All eight loads are relative to one base address
; (%addr1) and are spaced 256 i64 elements (2048 bytes) apart, i.e. at element
; offsets 0, 256, ..., 1792.
; The autogenerated assertions below record how each target materialises those
; addresses: the gfx803 output forms every address with explicit 64-bit adds
; and uses flat loads, while the gfx900, gfx90a, gfx1010 and gfx1100 outputs
; fold part of each constant into the immediate offset field of the global
; loads.
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py (see the note at the top of the file).
define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)*  %buffer) {
; GFX8-LABEL: clmem_read_simplified:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s38, -1
; GFX8-NEXT:    s_mov_b32 s39, 0xe80000
; GFX8-NEXT:    s_add_u32 s36, s36, s3
; GFX8-NEXT:    s_addc_u32 s37, s37, 0
; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX8-NEXT:    s_getpc_b64 s[0:1]
; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT:    v_mov_b32_e32 v31, v0
; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    s_mov_b32 s32, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT:    v_mov_b32_e32 v4, s35
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s34, v0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x800
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x1000
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x1800
; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[11:12], v[0:1]
; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[7:8]
; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10]
; GFX8-NEXT:    s_movk_i32 s0, 0x2000
; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x2800
; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[13:14], v[13:14]
; GFX8-NEXT:    flat_load_dwordx2 v[15:16], v[15:16]
; GFX8-NEXT:    s_movk_i32 s0, 0x3000
; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[17:18], v[17:18]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x3800, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(6)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v11
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v6, v12, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(5)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v7, v2
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v8, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(4)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(3)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v13, v2
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v14, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v15, v2
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v16, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v17, v2
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v18, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX900-LABEL: clmem_read_simplified:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX900-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX900-NEXT:    s_mov_b32 s38, -1
; GFX900-NEXT:    s_mov_b32 s39, 0xe00000
; GFX900-NEXT:    s_add_u32 s36, s36, s3
; GFX900-NEXT:    s_addc_u32 s37, s37, 0
; GFX900-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX900-NEXT:    s_getpc_b64 s[0:1]
; GFX900-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX900-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX900-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX900-NEXT:    v_mov_b32_e32 v31, v0
; GFX900-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; GFX900-NEXT:    v_mov_b32_e32 v4, s35
; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, s34, v0
; GFX900-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
; GFX900-NEXT:    s_movk_i32 s1, 0x2000
; GFX900-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
; GFX900-NEXT:    global_load_dwordx2 v[7:8], v[0:1], off offset:2048
; GFX900-NEXT:    v_add_co_u32_e32 v9, vcc, s1, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v10, vcc, 0, v1, vcc
; GFX900-NEXT:    global_load_dwordx2 v[11:12], v[9:10], off offset:-4096
; GFX900-NEXT:    s_movk_i32 s0, 0x1000
; GFX900-NEXT:    v_add_co_u32_e32 v13, vcc, s0, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v14, vcc, 0, v1, vcc
; GFX900-NEXT:    global_load_dwordx2 v[15:16], v[13:14], off offset:2048
; GFX900-NEXT:    global_load_dwordx2 v[17:18], v[9:10], off
; GFX900-NEXT:    global_load_dwordx2 v[19:20], v[9:10], off offset:2048
; GFX900-NEXT:    s_movk_i32 s0, 0x3000
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX900-NEXT:    global_load_dwordx2 v[9:10], v[0:1], off
; GFX900-NEXT:    global_load_dwordx2 v[13:14], v[0:1], off offset:2048
; GFX900-NEXT:    s_waitcnt vmcnt(6)
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v7, v5
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v6, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(5)
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v11, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v12, v1, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(4)
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v15, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v16, v1, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(3)
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v17, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v18, v1, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(2)
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v19, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v20, v1, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(1)
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v9, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v10, v1, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v13, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v14, v1, vcc
; GFX900-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
; GFX900-NEXT:    s_endpgm
;
; GFX10-LABEL: clmem_read_simplified:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s38, -1
; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000
; GFX10-NEXT:    s_add_u32 s36, s36, s3
; GFX10-NEXT:    s_addc_u32 s37, s37, 0
; GFX10-NEXT:    s_getpc_b64 s[2:3]
; GFX10-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v31, v0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT:    s_mov_b32 s32, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 7, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff8000, v2
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
; GFX10-NEXT:    v_add_co_u32 v2, s0, s34, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s35, 0, s0
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0x1000
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v0, 0x2000
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off offset:-2048
; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[4:5], off
; GFX10-NEXT:    global_load_dwordx2 v[14:15], v[10:11], off offset:-2048
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0x3000
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[10:11], off
; GFX10-NEXT:    global_load_dwordx2 v[18:19], v[4:5], off offset:-2048
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[4:5], off
; GFX10-NEXT:    global_load_dwordx2 v[20:21], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(6)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v6
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v7, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(5)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v12, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(4)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v14, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v15, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(3)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v16, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v18, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v10, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v20, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v21, v1, vcc_lo
; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
; GFX10-NEXT:    s_endpgm
;
; GFX90A-LABEL: clmem_read_simplified:
; GFX90A:       ; %bb.0: ; %entry
; GFX90A-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX90A-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX90A-NEXT:    s_mov_b32 s38, -1
; GFX90A-NEXT:    s_mov_b32 s39, 0xe00000
; GFX90A-NEXT:    s_add_u32 s36, s36, s3
; GFX90A-NEXT:    s_addc_u32 s37, s37, 0
; GFX90A-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX90A-NEXT:    s_getpc_b64 s[0:1]
; GFX90A-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX90A-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX90A-NEXT:    v_mov_b32_e32 v31, v0
; GFX90A-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_mov_b32 s32, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX90A-NEXT:    v_and_b32_e32 v2, 0xff, v0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
; GFX90A-NEXT:    v_mov_b32_e32 v1, s35
; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v0
; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT:    v_lshlrev_b64 v[2:3], 3, v[2:3]
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX90A-NEXT:    s_movk_i32 s1, 0x2000
; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GFX90A-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:2048
; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, s1, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v3, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[10:11], v[8:9], off offset:-4096
; GFX90A-NEXT:    s_movk_i32 s0, 0x1000
; GFX90A-NEXT:    v_add_co_u32_e32 v12, vcc, s0, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v3, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[14:15], v[12:13], off offset:2048
; GFX90A-NEXT:    global_load_dwordx2 v[16:17], v[8:9], off
; GFX90A-NEXT:    global_load_dwordx2 v[18:19], v[8:9], off offset:2048
; GFX90A-NEXT:    s_movk_i32 s0, 0x3000
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
; GFX90A-NEXT:    global_load_dwordx2 v[12:13], v[2:3], off offset:2048
; GFX90A-NEXT:    s_waitcnt vmcnt(6)
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v4
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v5, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(5)
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v10, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v11, v3, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(4)
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v14, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v15, v3, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(3)
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v16, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v17, v3, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(2)
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v18, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v19, v3, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(1)
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v9, v3, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v12, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v13, v3, vcc
; GFX90A-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; GFX90A-NEXT:    s_endpgm
;
; GFX11-LABEL: clmem_read_simplified:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT:    s_load_b64 s[34:35], s[0:1], 0x24
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff8000, v2
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v2, s0, s34, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s35, 0, s0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
; GFX11-NEXT:    global_load_b64 v[6:7], v[0:1], off offset:2048
; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v0, 0x2000
; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, 0x1000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[12:13], v[8:9], off offset:-4096
; GFX11-NEXT:    global_load_b64 v[10:11], v[10:11], off offset:2048
; GFX11-NEXT:    v_add_co_u32 v14, vcc_lo, 0x2000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    global_load_b64 v[14:15], v[14:15], off offset:2048
; GFX11-NEXT:    global_load_b64 v[16:17], v[0:1], off
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
; GFX11-NEXT:    s_waitcnt vmcnt(6)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v6, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v7, v5, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(5)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v12, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v13, v5, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(4)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v10, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v11, v5, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(3)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v8, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v9, v5, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v14, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v15, v5, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v16, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v17, v5, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  ; Split the value returned by @_Z13get_global_idj(i32 0) into two parts:
  ;   %conv      - its low 8 bits, used below as an i64 element index;
  ;   %idx.ext11 - (value << 7) masked with 4294934528 (0xFFFF8000), used as a
  ;                byte offset into %buffer.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  ; %saddr = %buffer + %idx.ext11 bytes, reinterpreted as an i64 pointer. It is
  ; both the base for the loads and the destination of the final store.
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  ; %addr1 = %saddr + %conv elements. Every later address is %addr1 plus a
  ; constant multiple of 256 elements (2048 bytes).
  ; Loads 1 and 2: element offsets 0 and 256.
  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add.1 = add i64 %load2, %load1

  ; Loads 3 and 4: element offsets 512 and 768, accumulated into the sum.
  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %load3, %add.1
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %load4, %add.2

  ; Loads 5 and 6: element offsets 1024 and 1280.
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %load5, %add.3
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %load6, %add.4

  ; Loads 7 and 8: element offsets 1536 and 1792 (0x3800 bytes, the largest
  ; constant that appears in the expected output).
  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  ; The total is written to %saddr (the base without the %conv index), not to
  ; %addr1.
  store i64 %add.7, i64 addrspace(1)* %saddr, align 8
  ret void
}

define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)*  %buffer) {
; GFX8-LABEL: clmem_read:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s38, -1
; GFX8-NEXT:    s_mov_b32 s39, 0xe80000
; GFX8-NEXT:    s_add_u32 s36, s36, s3
; GFX8-NEXT:    s_addc_u32 s37, s37, 0
; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX8-NEXT:    s_getpc_b64 s[0:1]
; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT:    v_mov_b32_e32 v31, v0
; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    s_mov_b32 s32, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 17, v0
; GFX8-NEXT:    v_and_b32_e32 v4, 0xfe000000, v0
; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 3, v[1:2]
; GFX8-NEXT:    v_mov_b32_e32 v5, s35
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s34, v4
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s34, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x5000
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; GFX8-NEXT:    v_mov_b32_e32 v6, 0
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT:    v_mov_b32_e32 v7, 0
; GFX8-NEXT:    s_movk_i32 s0, 0x7f
; GFX8-NEXT:  .LBB1_1: ; %for.cond.preheader
; GFX8-NEXT:    ; =>This Loop Header: Depth=1
; GFX8-NEXT:    ; Child Loop BB1_2 Depth 2
; GFX8-NEXT:    v_mov_b32_e32 v5, v3
; GFX8-NEXT:    v_mov_b32_e32 v4, v2
; GFX8-NEXT:    s_mov_b32 s1, 0
; GFX8-NEXT:  .LBB1_2: ; %for.body
; GFX8-NEXT:    ; Parent Loop BB1_1 Depth=1
; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0xffffb000, v4
; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, -1, v5, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[8:9]
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 0xffffb800, v4
; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, -1, v5, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[10:11], v[10:11]
; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 0xffffc000, v4
; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, -1, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0xffffc800, v4
; GFX8-NEXT:    v_addc_u32_e32 v15, vcc, -1, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 0xffffd000, v4
; GFX8-NEXT:    flat_load_dwordx2 v[12:13], v[12:13]
; GFX8-NEXT:    flat_load_dwordx2 v[14:15], v[14:15]
; GFX8-NEXT:    v_addc_u32_e32 v17, vcc, -1, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v18, vcc, 0xffffd800, v4
; GFX8-NEXT:    v_addc_u32_e32 v19, vcc, -1, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v20, vcc, 0xffffe000, v4
; GFX8-NEXT:    flat_load_dwordx2 v[16:17], v[16:17]
; GFX8-NEXT:    flat_load_dwordx2 v[18:19], v[18:19]
; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, -1, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, 0xffffe800, v4
; GFX8-NEXT:    v_addc_u32_e32 v23, vcc, -1, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v24, vcc, 0xfffff000, v4
; GFX8-NEXT:    flat_load_dwordx2 v[20:21], v[20:21]
; GFX8-NEXT:    flat_load_dwordx2 v[22:23], v[22:23]
; GFX8-NEXT:    v_addc_u32_e32 v25, vcc, -1, v5, vcc
; GFX8-NEXT:    s_addk_i32 s1, 0x2000
; GFX8-NEXT:    s_cmp_gt_u32 s1, 0x3fffff
; GFX8-NEXT:    s_waitcnt vmcnt(7)
; GFX8-NEXT:    v_add_u32_e32 v26, vcc, v8, v6
; GFX8-NEXT:    v_addc_u32_e32 v27, vcc, v9, v7, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[24:25]
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0xfffff800, v4
; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, -1, v5, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[6:7], v[6:7]
; GFX8-NEXT:    flat_load_dwordx2 v[24:25], v[4:5]
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x10000, v4
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(9)
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v26
; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v11, v27, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(8)
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v12, v10
; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v13, v11, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(7)
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v14, v10
; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v15, v11, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(6)
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v16, v10
; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v17, v11, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(5)
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v18, v10
; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v19, v11, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(4)
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v20, v10
; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v21, v11, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(3)
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v22, v10
; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v23, v11, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v24, v6
; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v25, v7, vcc
; GFX8-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX8-NEXT:  ; %bb.3: ; %while.cond.loopexit
; GFX8-NEXT:    ; in Loop: Header=BB1_1 Depth=1
; GFX8-NEXT:    s_add_i32 s1, s0, -1
; GFX8-NEXT:    s_cmp_eq_u32 s0, 0
; GFX8-NEXT:    s_cbranch_scc1 .LBB1_5
; GFX8-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
; GFX8-NEXT:    s_mov_b32 s0, s1
; GFX8-NEXT:    s_branch .LBB1_1
; GFX8-NEXT:  .LBB1_5: ; %while.end
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
; GFX8-NEXT:    s_endpgm
;
; GFX900-LABEL: clmem_read:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX900-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX900-NEXT:    s_mov_b32 s38, -1
; GFX900-NEXT:    s_mov_b32 s39, 0xe00000
; GFX900-NEXT:    s_add_u32 s36, s36, s3
; GFX900-NEXT:    s_addc_u32 s37, s37, 0
; GFX900-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX900-NEXT:    s_getpc_b64 s[0:1]
; GFX900-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX900-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX900-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX900-NEXT:    v_mov_b32_e32 v31, v0
; GFX900-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 17, v0
; GFX900-NEXT:    v_and_b32_e32 v4, 0xfe000000, v0
; GFX900-NEXT:    v_lshlrev_b64 v[2:3], 3, v[1:2]
; GFX900-NEXT:    v_mov_b32_e32 v5, s35
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v4
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
; GFX900-NEXT:    v_or_b32_e32 v2, v4, v2
; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v2
; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
; GFX900-NEXT:    s_movk_i32 s0, 0x5000
; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX900-NEXT:    s_movk_i32 s4, 0x7f
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    s_movk_i32 s2, 0xd000
; GFX900-NEXT:    s_movk_i32 s3, 0xe000
; GFX900-NEXT:    s_movk_i32 s5, 0xf000
; GFX900-NEXT:  .LBB1_1: ; %for.cond.preheader
; GFX900-NEXT:    ; =>This Loop Header: Depth=1
; GFX900-NEXT:    ; Child Loop BB1_2 Depth 2
; GFX900-NEXT:    v_mov_b32_e32 v5, v3
; GFX900-NEXT:    v_mov_b32_e32 v4, v2
; GFX900-NEXT:    s_mov_b32 s6, 0
; GFX900-NEXT:  .LBB1_2: ; %for.body
; GFX900-NEXT:    ; Parent Loop BB1_1 Depth=1
; GFX900-NEXT:    ; => This Inner Loop Header: Depth=2
; GFX900-NEXT:    v_add_co_u32_e32 v8, vcc, 0xffffb000, v4
; GFX900-NEXT:    v_addc_co_u32_e32 v9, vcc, -1, v5, vcc
; GFX900-NEXT:    global_load_dwordx2 v[10:11], v[4:5], off offset:-4096
; GFX900-NEXT:    global_load_dwordx2 v[12:13], v[4:5], off offset:-2048
; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, 0xffffc000, v4
; GFX900-NEXT:    global_load_dwordx2 v[8:9], v[8:9], off
; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, -1, v5, vcc
; GFX900-NEXT:    global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
; GFX900-NEXT:    v_add_co_u32_e32 v16, vcc, s2, v4
; GFX900-NEXT:    v_addc_co_u32_e32 v17, vcc, -1, v5, vcc
; GFX900-NEXT:    global_load_dwordx2 v[22:23], v[14:15], off
; GFX900-NEXT:    global_load_dwordx2 v[24:25], v[16:17], off offset:-2048
; GFX900-NEXT:    v_add_co_u32_e32 v20, vcc, s3, v4
; GFX900-NEXT:    v_addc_co_u32_e32 v21, vcc, -1, v5, vcc
; GFX900-NEXT:    global_load_dwordx2 v[16:17], v[20:21], off offset:-4096
; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, s5, v4
; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, -1, v5, vcc
; GFX900-NEXT:    s_addk_i32 s6, 0x2000
; GFX900-NEXT:    s_cmp_gt_u32 s6, 0x3fffff
; GFX900-NEXT:    s_waitcnt vmcnt(4)
; GFX900-NEXT:    v_add_co_u32_e64 v28, s[0:1], v8, v6
; GFX900-NEXT:    v_addc_co_u32_e64 v29, s[0:1], v9, v7, s[0:1]
; GFX900-NEXT:    global_load_dwordx2 v[6:7], v[20:21], off offset:-2048
; GFX900-NEXT:    global_load_dwordx2 v[8:9], v[20:21], off
; GFX900-NEXT:    s_nop 0
; GFX900-NEXT:    global_load_dwordx2 v[20:21], v[14:15], off offset:-2048
; GFX900-NEXT:    global_load_dwordx2 v[26:27], v[4:5], off
; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, 0x10000, v4
; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(7)
; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, v18, v28
; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, v19, v29, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(6)
; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, v22, v14
; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, v23, v15, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(5)
; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, v24, v14
; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, v25, v15, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(4)
; GFX900-NEXT:    v_add_co_u32_e32 v14, vcc, v16, v14
; GFX900-NEXT:    v_addc_co_u32_e32 v15, vcc, v17, v15, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(3)
; GFX900-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v14
; GFX900-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v15, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(2)
; GFX900-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v6
; GFX900-NEXT:    v_addc_co_u32_e32 v7, vcc, v9, v7, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(1)
; GFX900-NEXT:    v_add_co_u32_e32 v6, vcc, v20, v6
; GFX900-NEXT:    v_addc_co_u32_e32 v7, vcc, v21, v7, vcc
; GFX900-NEXT:    v_add_co_u32_e32 v6, vcc, v10, v6
; GFX900-NEXT:    v_addc_co_u32_e32 v7, vcc, v11, v7, vcc
; GFX900-NEXT:    v_add_co_u32_e32 v6, vcc, v12, v6
; GFX900-NEXT:    v_addc_co_u32_e32 v7, vcc, v13, v7, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_add_co_u32_e32 v6, vcc, v26, v6
; GFX900-NEXT:    v_addc_co_u32_e32 v7, vcc, v27, v7, vcc
; GFX900-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX900-NEXT:  ; %bb.3: ; %while.cond.loopexit
; GFX900-NEXT:    ; in Loop: Header=BB1_1 Depth=1
; GFX900-NEXT:    s_add_i32 s0, s4, -1
; GFX900-NEXT:    s_cmp_eq_u32 s4, 0
; GFX900-NEXT:    s_cbranch_scc1 .LBB1_5
; GFX900-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
; GFX900-NEXT:    s_mov_b32 s4, s0
; GFX900-NEXT:    s_branch .LBB1_1
; GFX900-NEXT:  .LBB1_5: ; %while.end
; GFX900-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off
; GFX900-NEXT:    s_endpgm
;
; GFX10-LABEL: clmem_read:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s38, -1
; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000
; GFX10-NEXT:    s_add_u32 s36, s36, s3
; GFX10-NEXT:    s_addc_u32 s37, s37, 0
; GFX10-NEXT:    s_getpc_b64 s[2:3]
; GFX10-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v31, v0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT:    s_mov_b32 s32, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 17, v0
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    v_mov_b32_e32 v5, 0
; GFX10-NEXT:    s_movk_i32 s1, 0x7f
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
; GFX10-NEXT:    v_and_b32_e32 v2, 0xfe000000, v3
; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v0, s34
; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, s35, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v0, s0, s34, v2
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x5000, v3
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, s35, 0, s0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
; GFX10-NEXT:  .LBB1_1: ; %for.cond.preheader
; GFX10-NEXT:    ; =>This Loop Header: Depth=1
; GFX10-NEXT:    ; Child Loop BB1_2 Depth 2
; GFX10-NEXT:    v_mov_b32_e32 v7, v3
; GFX10-NEXT:    v_mov_b32_e32 v6, v2
; GFX10-NEXT:    s_mov_b32 s2, 0
; GFX10-NEXT:  .LBB1_2: ; %for.body
; GFX10-NEXT:    ; Parent Loop BB1_1 Depth=1
; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v6, 0xffffb800
; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, -1, v7, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v6, 0xffffc800
; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, -1, v7, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v14, vcc_lo, v6, 0xffffd800
; GFX10-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, -1, v7, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v18, vcc_lo, v6, 0xffffe800
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[8:9], off offset:-2048
; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[10:11], off offset:-2048
; GFX10-NEXT:    global_load_dwordx2 v[20:21], v[14:15], off offset:-2048
; GFX10-NEXT:    v_add_co_ci_u32_e32 v19, vcc_lo, -1, v7, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v22, vcc_lo, 0xfffff000, v6
; GFX10-NEXT:    v_add_co_ci_u32_e32 v23, vcc_lo, -1, v7, vcc_lo
; GFX10-NEXT:    s_clause 0x7
; GFX10-NEXT:    global_load_dwordx2 v[24:25], v[18:19], off offset:-2048
; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[8:9], off
; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[10:11], off
; GFX10-NEXT:    global_load_dwordx2 v[14:15], v[14:15], off
; GFX10-NEXT:    global_load_dwordx2 v[26:27], v[18:19], off
; GFX10-NEXT:    global_load_dwordx2 v[28:29], v[22:23], off
; GFX10-NEXT:    global_load_dwordx2 v[30:31], v[6:7], off offset:-2048
; GFX10-NEXT:    global_load_dwordx2 v[32:33], v[6:7], off
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, 0x10000, v6
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT:    s_addk_i32 s2, 0x2000
; GFX10-NEXT:    s_cmp_gt_u32 s2, 0x3fffff
; GFX10-NEXT:    s_waitcnt vmcnt(10)
; GFX10-NEXT:    v_add_co_u32 v4, s0, v12, v4
; GFX10-NEXT:    v_add_co_ci_u32_e64 v5, s0, v13, v5, s0
; GFX10-NEXT:    s_waitcnt vmcnt(6)
; GFX10-NEXT:    v_add_co_u32 v4, s0, v8, v4
; GFX10-NEXT:    v_add_co_ci_u32_e64 v5, s0, v9, v5, s0
; GFX10-NEXT:    v_add_co_u32 v4, s0, v16, v4
; GFX10-NEXT:    v_add_co_ci_u32_e64 v5, s0, v17, v5, s0
; GFX10-NEXT:    s_waitcnt vmcnt(5)
; GFX10-NEXT:    v_add_co_u32 v4, s0, v10, v4
; GFX10-NEXT:    v_add_co_ci_u32_e64 v5, s0, v11, v5, s0
; GFX10-NEXT:    v_add_co_u32 v4, s0, v20, v4
; GFX10-NEXT:    v_add_co_ci_u32_e64 v5, s0, v21, v5, s0
; GFX10-NEXT:    s_waitcnt vmcnt(4)
; GFX10-NEXT:    v_add_co_u32 v4, s0, v14, v4
; GFX10-NEXT:    v_add_co_ci_u32_e64 v5, s0, v15, v5, s0
; GFX10-NEXT:    v_add_co_u32 v4, s0, v24, v4
; GFX10-NEXT:    v_add_co_ci_u32_e64 v5, s0, v25, v5, s0
; GFX10-NEXT:    s_waitcnt vmcnt(3)
; GFX10-NEXT:    v_add_co_u32 v4, s0, v26, v4
; GFX10-NEXT:    v_add_co_ci_u32_e64 v5, s0, v27, v5, s0
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    v_add_co_u32 v4, s0, v28, v4
; GFX10-NEXT:    v_add_co_ci_u32_e64 v5, s0, v29, v5, s0
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_add_co_u32 v4, s0, v30, v4
; GFX10-NEXT:    v_add_co_ci_u32_e64 v5, s0, v31, v5, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v32, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v33, v5, vcc_lo
; GFX10-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX10-NEXT:  ; %bb.3: ; %while.cond.loopexit
; GFX10-NEXT:    ; in Loop: Header=BB1_1 Depth=1
; GFX10-NEXT:    s_add_i32 s0, s1, -1
; GFX10-NEXT:    s_cmp_eq_u32 s1, 0
; GFX10-NEXT:    s_cbranch_scc1 .LBB1_5
; GFX10-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_branch .LBB1_1
; GFX10-NEXT:  .LBB1_5: ; %while.end
; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[4:5], off
; GFX10-NEXT:    s_endpgm
;
; GFX90A-LABEL: clmem_read:
; GFX90A:       ; %bb.0: ; %entry
; GFX90A-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX90A-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX90A-NEXT:    s_mov_b32 s38, -1
; GFX90A-NEXT:    s_mov_b32 s39, 0xe00000
; GFX90A-NEXT:    s_add_u32 s36, s36, s3
; GFX90A-NEXT:    s_addc_u32 s37, s37, 0
; GFX90A-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX90A-NEXT:    s_getpc_b64 s[0:1]
; GFX90A-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX90A-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX90A-NEXT:    v_mov_b32_e32 v31, v0
; GFX90A-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_mov_b32 s32, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX90A-NEXT:    v_and_b32_e32 v2, 0xff, v0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 17, v0
; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
; GFX90A-NEXT:    v_and_b32_e32 v4, 0xfe000000, v0
; GFX90A-NEXT:    v_mov_b32_e32 v5, s35
; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v4
; GFX90A-NEXT:    v_lshlrev_b64 v[2:3], 3, v[2:3]
; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
; GFX90A-NEXT:    v_or_b32_e32 v2, v4, v2
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
; GFX90A-NEXT:    s_movk_i32 s0, 0x5000
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX90A-NEXT:    s_movk_i32 s2, 0x7f
; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], 0, 0
; GFX90A-NEXT:    s_movk_i32 s0, 0xd000
; GFX90A-NEXT:    s_movk_i32 s1, 0xe000
; GFX90A-NEXT:    s_movk_i32 s3, 0xf000
; GFX90A-NEXT:  .LBB1_1: ; %for.cond.preheader
; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
; GFX90A-NEXT:    ; Child Loop BB1_2 Depth 2
; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT:    s_mov_b32 s4, 0
; GFX90A-NEXT:  .LBB1_2: ; %for.body
; GFX90A-NEXT:    ; Parent Loop BB1_1 Depth=1
; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, 0xffffb000, v6
; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, -1, v7, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[10:11], v[6:7], off offset:-4096
; GFX90A-NEXT:    global_load_dwordx2 v[12:13], v[6:7], off offset:-2048
; GFX90A-NEXT:    v_add_co_u32_e32 v14, vcc, 0xffffc000, v6
; GFX90A-NEXT:    global_load_dwordx2 v[8:9], v[8:9], off
; GFX90A-NEXT:    v_addc_co_u32_e32 v15, vcc, -1, v7, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
; GFX90A-NEXT:    v_add_co_u32_e32 v16, vcc, s0, v6
; GFX90A-NEXT:    v_addc_co_u32_e32 v17, vcc, -1, v7, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[16:17], v[16:17], off offset:-2048
; GFX90A-NEXT:    v_add_co_u32_e32 v20, vcc, s1, v6
; GFX90A-NEXT:    global_load_dwordx2 v[14:15], v[14:15], off
; GFX90A-NEXT:    v_addc_co_u32_e32 v21, vcc, -1, v7, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[24:25], v[20:21], off offset:-4096
; GFX90A-NEXT:    global_load_dwordx2 v[26:27], v[20:21], off offset:-2048
; GFX90A-NEXT:    global_load_dwordx2 v[28:29], v[20:21], off
; GFX90A-NEXT:    v_add_co_u32_e32 v22, vcc, s3, v6
; GFX90A-NEXT:    v_addc_co_u32_e32 v23, vcc, -1, v7, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[20:21], v[22:23], off offset:-2048
; GFX90A-NEXT:    global_load_dwordx2 v[30:31], v[6:7], off
; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, 0x10000, v6
; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX90A-NEXT:    s_addk_i32 s4, 0x2000
; GFX90A-NEXT:    s_cmp_gt_u32 s4, 0x3fffff
; GFX90A-NEXT:    s_waitcnt vmcnt(8)
; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(7)
; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v18, v4
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v19, v5, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(5)
; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v14, v4
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v16, v4
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v17, v5, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(4)
; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v24, v4
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v25, v5, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(3)
; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v26, v4
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v27, v5, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(2)
; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v28, v4
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v29, v5, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(1)
; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v20, v4
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v21, v5, vcc
; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v10, v4
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v11, v5, vcc
; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v13, v5, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v30, v4
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v31, v5, vcc
; GFX90A-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX90A-NEXT:  ; %bb.3: ; %while.cond.loopexit
; GFX90A-NEXT:    ; in Loop: Header=BB1_1 Depth=1
; GFX90A-NEXT:    s_add_i32 s4, s2, -1
; GFX90A-NEXT:    s_cmp_eq_u32 s2, 0
; GFX90A-NEXT:    s_cbranch_scc1 .LBB1_5
; GFX90A-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
; GFX90A-NEXT:    s_mov_b32 s2, s4
; GFX90A-NEXT:    s_branch .LBB1_1
; GFX90A-NEXT:  .LBB1_5: ; %while.end
; GFX90A-NEXT:    global_store_dwordx2 v[0:1], v[4:5], off
; GFX90A-NEXT:    s_endpgm
;
; GFX11-LABEL: clmem_read:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT:    s_load_b64 s[34:35], s[0:1], 0x24
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xff, v0
; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_lshlrev_b32 v3, 17, v0
; GFX11-NEXT:    s_movk_i32 s1, 0x7f
; GFX11-NEXT:    v_mov_b32_e32 v5, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
; GFX11-NEXT:    v_and_b32_e32 v2, 0xfe000000, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, v0, s34
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, s35, v1, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v0, s0, s34, v2
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x5000, v3
; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s35, 0, s0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
; GFX11-NEXT:  .LBB1_1: ; %for.cond.preheader
; GFX11-NEXT:    ; =>This Loop Header: Depth=1
; GFX11-NEXT:    ; Child Loop BB1_2 Depth 2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2
; GFX11-NEXT:    s_mov_b32 s2, 0
; GFX11-NEXT:  .LBB1_2: ; %for.body
; GFX11-NEXT:    ; Parent Loop BB1_1 Depth=1
; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v6, 0xffffc000
; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, -1, v7, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, 0xffffc000, v6
; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, -1, v7, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v12, vcc_lo, 0xffffd000, v6
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[14:15], v[8:9], off offset:-4096
; GFX11-NEXT:    global_load_b64 v[10:11], v[10:11], off offset:-2048
; GFX11-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, -1, v7, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v16, vcc_lo, v6, 0xffffe000
; GFX11-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, -1, v7, vcc_lo
; GFX11-NEXT:    global_load_b64 v[12:13], v[12:13], off offset:-2048
; GFX11-NEXT:    v_add_co_u32 v18, vcc_lo, 0xffffe000, v6
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[20:21], v[16:17], off offset:-4096
; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off
; GFX11-NEXT:    v_add_co_ci_u32_e32 v19, vcc_lo, -1, v7, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v22, vcc_lo, 0xfffff000, v6
; GFX11-NEXT:    v_add_co_ci_u32_e32 v23, vcc_lo, -1, v7, vcc_lo
; GFX11-NEXT:    s_clause 0x5
; GFX11-NEXT:    global_load_b64 v[18:19], v[18:19], off offset:-2048
; GFX11-NEXT:    global_load_b64 v[16:17], v[16:17], off
; GFX11-NEXT:    global_load_b64 v[22:23], v[22:23], off offset:-2048
; GFX11-NEXT:    global_load_b64 v[24:25], v[6:7], off offset:-4096
; GFX11-NEXT:    global_load_b64 v[26:27], v[6:7], off offset:-2048
; GFX11-NEXT:    global_load_b64 v[28:29], v[6:7], off
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, 0x10000, v6
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT:    s_addk_i32 s2, 0x2000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT:    s_cmp_gt_u32 s2, 0x3fffff
; GFX11-NEXT:    s_waitcnt vmcnt(10)
; GFX11-NEXT:    v_add_co_u32 v4, s0, v14, v4
; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, s0, v15, v5, s0
; GFX11-NEXT:    s_waitcnt vmcnt(9)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v4, s0, v10, v4
; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, s0, v11, v5, s0
; GFX11-NEXT:    s_waitcnt vmcnt(6)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v4, s0, v8, v4
; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, s0, v9, v5, s0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v4, s0, v12, v4
; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, s0, v13, v5, s0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v4, s0, v20, v4
; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, s0, v21, v5, s0
; GFX11-NEXT:    s_waitcnt vmcnt(5)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v4, s0, v18, v4
; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, s0, v19, v5, s0
; GFX11-NEXT:    s_waitcnt vmcnt(4)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v4, s0, v16, v4
; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, s0, v17, v5, s0
; GFX11-NEXT:    s_waitcnt vmcnt(3)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v4, s0, v22, v4
; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, s0, v23, v5, s0
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v4, s0, v24, v4
; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, s0, v25, v5, s0
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v4, s0, v26, v4
; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, s0, v27, v5, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v28, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v29, v5, vcc_lo
; GFX11-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX11-NEXT:  ; %bb.3: ; %while.cond.loopexit
; GFX11-NEXT:    ; in Loop: Header=BB1_1 Depth=1
; GFX11-NEXT:    s_add_i32 s0, s1, -1
; GFX11-NEXT:    s_cmp_eq_u32 s1, 0
; GFX11-NEXT:    s_cbranch_scc1 .LBB1_5
; GFX11-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_branch .LBB1_1
; GFX11-NEXT:  .LBB1_5: ; %while.end
; GFX11-NEXT:    global_store_b64 v[0:1], v[4:5], off
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 17
  %idx.ext11 = and i64 %a0, 4261412864
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %a1 = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
  %add.ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %a1, i64 %conv
  br label %for.cond.preheader

while.cond.loopexit:                              ; preds = %for.body
  %dec = add nsw i32 %dec31, -1
  %tobool = icmp eq i32 %dec31, 0
  br i1 %tobool, label %while.end, label %for.cond.preheader

for.cond.preheader:                               ; preds = %entry, %while.cond.loopexit
  %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ]
  %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ]
  br label %for.body

for.body:                                         ; preds = %for.body, %for.cond.preheader
  %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ]
  %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ]
  %conv3 = zext i32 %block.029 to i64
  %add.ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3
  %load1 = load i64, i64 addrspace(1)* %add.ptr8, align 8
  %add = add i64 %load1, %sum.128

  %add9 = or i32 %block.029, 256
  %conv3.1 = zext i32 %add9 to i64
  %add.ptr8.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.1
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.1, align 8
  %add.1 = add i64 %load2, %add

  %add9.1 = or i32 %block.029, 512
  %conv3.2 = zext i32 %add9.1 to i64
  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.2
  %l3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %l3, %add.1

  %add9.2 = or i32 %block.029, 768
  %conv3.3 = zext i32 %add9.2 to i64
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.3
  %l4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %l4, %add.2

  %add9.3 = or i32 %block.029, 1024
  %conv3.4 = zext i32 %add9.3 to i64
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.4
  %l5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %l5, %add.3

  %add9.4 = or i32 %block.029, 1280
  %conv3.5 = zext i32 %add9.4 to i64
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.5
  %l6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %l6, %add.4

  %add9.5 = or i32 %block.029, 1536
  %conv3.6 = zext i32 %add9.5 to i64
  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.6
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5

  %add9.6 = or i32 %block.029, 1792
  %conv3.7 = zext i32 %add9.6 to i64
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.7
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  %add9.7 = or i32 %block.029, 2048
  %conv3.8 = zext i32 %add9.7 to i64
  %add.ptr8.8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.8
  %load9 = load i64, i64 addrspace(1)* %add.ptr8.8, align 8
  %add.8 = add i64 %load9, %add.7

  %add9.8 = or i32 %block.029, 2304
  %conv3.9 = zext i32 %add9.8 to i64
  %add.ptr8.9 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.9
  %load10 = load i64, i64 addrspace(1)* %add.ptr8.9, align 8
  %add.9 = add i64 %load10, %add.8

  %add9.9 = or i32 %block.029, 2560
  %conv3.10 = zext i32 %add9.9 to i64
  %add.ptr8.10 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.10
  %load11 = load i64, i64 addrspace(1)* %add.ptr8.10, align 8
  %add.10 = add i64 %load11, %add.9

  %add9.31 = add nuw nsw i32 %block.029, 8192
  %cmp.31 = icmp ult i32 %add9.31, 4194304
  br i1 %cmp.31, label %for.body, label %while.cond.loopexit

while.end:                                        ; preds = %while.cond.loopexit
  store i64 %add.10, i64 addrspace(1)* %a1, align 8
  ret void
}

; Using a 32-bit address.
define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) {
; GFX8-LABEL: Address32:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s38, -1
; GFX8-NEXT:    s_mov_b32 s39, 0xe80000
; GFX8-NEXT:    s_add_u32 s36, s36, s3
; GFX8-NEXT:    s_addc_u32 s37, s37, 0
; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX8-NEXT:    s_getpc_b64 s[0:1]
; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT:    v_mov_b32_e32 v31, v0
; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    s_mov_b32 s32, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT:    v_mov_b32_e32 v4, s35
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s34, v0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[1:2]
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x400
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x800
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0xc00
; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x1000
; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, 0, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x1400
; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x1800
; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x1c00
; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x2000
; GFX8-NEXT:    flat_load_dword v2, v[0:1]
; GFX8-NEXT:    flat_load_dword v19, v[5:6]
; GFX8-NEXT:    flat_load_dword v7, v[7:8]
; GFX8-NEXT:    flat_load_dword v8, v[9:10]
; GFX8-NEXT:    flat_load_dword v9, v[11:12]
; GFX8-NEXT:    flat_load_dword v10, v[13:14]
; GFX8-NEXT:    flat_load_dword v11, v[15:16]
; GFX8-NEXT:    flat_load_dword v12, v[17:18]
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x2400, v0
; GFX8-NEXT:    flat_load_dword v5, v[5:6]
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(8)
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v19, v2
; GFX8-NEXT:    s_waitcnt vmcnt(7)
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v7, v1
; GFX8-NEXT:    s_waitcnt vmcnt(6)
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v8, v1
; GFX8-NEXT:    s_waitcnt vmcnt(5)
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v9, v1
; GFX8-NEXT:    s_waitcnt vmcnt(4)
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v10, v1
; GFX8-NEXT:    s_waitcnt vmcnt(3)
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v11, v1
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v12, v1
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v5, v1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT:    flat_store_dword v[3:4], v0
; GFX8-NEXT:    s_endpgm
;
; GFX900-LABEL: Address32:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX900-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX900-NEXT:    s_mov_b32 s38, -1
; GFX900-NEXT:    s_mov_b32 s39, 0xe00000
; GFX900-NEXT:    s_add_u32 s36, s36, s3
; GFX900-NEXT:    s_addc_u32 s37, s37, 0
; GFX900-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX900-NEXT:    s_getpc_b64 s[0:1]
; GFX900-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX900-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX900-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX900-NEXT:    v_mov_b32_e32 v31, v0
; GFX900-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; GFX900-NEXT:    v_mov_b32_e32 v4, s35
; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, s34, v0
; GFX900-NEXT:    v_lshlrev_b64 v[0:1], 2, v[1:2]
; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
; GFX900-NEXT:    s_movk_i32 s0, 0x1000
; GFX900-NEXT:    v_add_co_u32_e32 v5, vcc, s0, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
; GFX900-NEXT:    global_load_dword v2, v[0:1], off
; GFX900-NEXT:    global_load_dword v7, v[0:1], off offset:1024
; GFX900-NEXT:    global_load_dword v8, v[0:1], off offset:2048
; GFX900-NEXT:    global_load_dword v9, v[0:1], off offset:3072
; GFX900-NEXT:    global_load_dword v10, v[5:6], off
; GFX900-NEXT:    global_load_dword v11, v[5:6], off offset:1024
; GFX900-NEXT:    global_load_dword v12, v[5:6], off offset:2048
; GFX900-NEXT:    global_load_dword v13, v[5:6], off offset:3072
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX900-NEXT:    global_load_dword v5, v[0:1], off
; GFX900-NEXT:    global_load_dword v6, v[0:1], off offset:1024
; GFX900-NEXT:    s_waitcnt vmcnt(8)
; GFX900-NEXT:    v_add_u32_e32 v0, v7, v2
; GFX900-NEXT:    s_waitcnt vmcnt(6)
; GFX900-NEXT:    v_add3_u32 v0, v8, v0, v9
; GFX900-NEXT:    s_waitcnt vmcnt(4)
; GFX900-NEXT:    v_add3_u32 v0, v10, v0, v11
; GFX900-NEXT:    s_waitcnt vmcnt(2)
; GFX900-NEXT:    v_add3_u32 v0, v12, v0, v13
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_add3_u32 v0, v5, v0, v6
; GFX900-NEXT:    global_store_dword v[3:4], v0, off
; GFX900-NEXT:    s_endpgm
;
; GFX10-LABEL: Address32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s38, -1
; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000
; GFX10-NEXT:    s_add_u32 s36, s36, s3
; GFX10-NEXT:    s_addc_u32 s37, s37, 0
; GFX10-NEXT:    s_getpc_b64 s[2:3]
; GFX10-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v31, v0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT:    s_mov_b32 s32, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 7, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff8000, v2
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX10-NEXT:    v_add_co_u32 v2, s0, s34, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s35, 0, s0
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v0, 0x1000
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, 0x1000, v0
; GFX10-NEXT:    s_clause 0x4
; GFX10-NEXT:    global_load_dword v10, v[0:1], off
; GFX10-NEXT:    global_load_dword v11, v[0:1], off offset:1024
; GFX10-NEXT:    global_load_dword v12, v[4:5], off offset:1024
; GFX10-NEXT:    global_load_dword v13, v[6:7], off offset:-2048
; GFX10-NEXT:    global_load_dword v14, v[6:7], off
; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x1800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v0, 0x2000
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dword v15, v[8:9], off offset:1024
; GFX10-NEXT:    global_load_dword v16, v[4:5], off offset:1024
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    global_load_dword v4, v[6:7], off offset:-2048
; GFX10-NEXT:    global_load_dword v5, v[6:7], off
; GFX10-NEXT:    global_load_dword v8, v[0:1], off offset:1024
; GFX10-NEXT:    s_waitcnt vmcnt(8)
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v11, v10
; GFX10-NEXT:    s_waitcnt vmcnt(6)
; GFX10-NEXT:    v_add3_u32 v0, v13, v0, v12
; GFX10-NEXT:    s_waitcnt vmcnt(4)
; GFX10-NEXT:    v_add3_u32 v0, v14, v0, v15
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    v_add3_u32 v0, v4, v0, v16
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add3_u32 v0, v5, v0, v8
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_endpgm
;
; GFX90A-LABEL: Address32:
; GFX90A:       ; %bb.0: ; %entry
; GFX90A-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX90A-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX90A-NEXT:    s_mov_b32 s38, -1
; GFX90A-NEXT:    s_mov_b32 s39, 0xe00000
; GFX90A-NEXT:    s_add_u32 s36, s36, s3
; GFX90A-NEXT:    s_addc_u32 s37, s37, 0
; GFX90A-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX90A-NEXT:    s_getpc_b64 s[0:1]
; GFX90A-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX90A-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX90A-NEXT:    v_mov_b32_e32 v31, v0
; GFX90A-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_mov_b32 s32, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX90A-NEXT:    v_and_b32_e32 v2, 0xff, v0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
; GFX90A-NEXT:    v_mov_b32_e32 v1, s35
; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v0
; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX90A-NEXT:    s_movk_i32 s0, 0x1000
; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v3, vcc
; GFX90A-NEXT:    global_load_dword v6, v[2:3], off
; GFX90A-NEXT:    global_load_dword v7, v[2:3], off offset:1024
; GFX90A-NEXT:    global_load_dword v8, v[2:3], off offset:2048
; GFX90A-NEXT:    global_load_dword v9, v[2:3], off offset:3072
; GFX90A-NEXT:    global_load_dword v10, v[4:5], off
; GFX90A-NEXT:    global_load_dword v11, v[4:5], off offset:1024
; GFX90A-NEXT:    global_load_dword v12, v[4:5], off offset:2048
; GFX90A-NEXT:    global_load_dword v13, v[4:5], off offset:3072
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x2000, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX90A-NEXT:    global_load_dword v4, v[2:3], off
; GFX90A-NEXT:    global_load_dword v5, v[2:3], off offset:1024
; GFX90A-NEXT:    s_waitcnt vmcnt(8)
; GFX90A-NEXT:    v_add_u32_e32 v2, v7, v6
; GFX90A-NEXT:    s_waitcnt vmcnt(6)
; GFX90A-NEXT:    v_add3_u32 v2, v8, v2, v9
; GFX90A-NEXT:    s_waitcnt vmcnt(4)
; GFX90A-NEXT:    v_add3_u32 v2, v10, v2, v11
; GFX90A-NEXT:    s_waitcnt vmcnt(2)
; GFX90A-NEXT:    v_add3_u32 v2, v12, v2, v13
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_add3_u32 v2, v4, v2, v5
; GFX90A-NEXT:    global_store_dword v[0:1], v2, off
; GFX90A-NEXT:    s_endpgm
;
; GFX11-LABEL: Address32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT:    s_load_b64 s[34:35], s[0:1], 0x24
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff8000, v2
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v2, s0, s34, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s35, 0, s0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b32 v8, v[0:1], off
; GFX11-NEXT:    global_load_b32 v9, v[0:1], off offset:1024
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x1000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v0, 0x2000
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    s_clause 0x5
; GFX11-NEXT:    global_load_b32 v10, v[0:1], off offset:2048
; GFX11-NEXT:    global_load_b32 v11, v[0:1], off offset:3072
; GFX11-NEXT:    global_load_b32 v12, v[6:7], off offset:-4096
; GFX11-NEXT:    global_load_b32 v13, v[4:5], off offset:1024
; GFX11-NEXT:    global_load_b32 v14, v[4:5], off offset:2048
; GFX11-NEXT:    global_load_b32 v4, v[4:5], off offset:3072
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b32 v5, v[6:7], off
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:1024
; GFX11-NEXT:    s_waitcnt vmcnt(8)
; GFX11-NEXT:    v_add_nc_u32_e32 v1, v9, v8
; GFX11-NEXT:    s_waitcnt vmcnt(6)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add3_u32 v1, v10, v1, v11
; GFX11-NEXT:    s_waitcnt vmcnt(4)
; GFX11-NEXT:    v_add3_u32 v1, v12, v1, v13
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add3_u32 v1, v14, v1, v4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add3_u32 v0, v5, v1, v0
; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
   %call = tail call i64 @_Z13get_global_idj(i32 0)
   %conv = and i64 %call, 255
   %id = shl i64 %call, 7
   %idx.ext11 = and i64 %id, 4294934528
   %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
   %addr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

   %add.ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %addr, i64 %conv
   %load1 = load i32, i32 addrspace(1)* %add.ptr6, align 4

   %add.ptr8.1 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 256
   %load2 = load i32, i32 addrspace(1)* %add.ptr8.1, align 4
   %add.1 = add i32 %load2, %load1

   %add.ptr8.2 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 512
   %load3 = load i32, i32 addrspace(1)* %add.ptr8.2, align 4
   %add.2 = add i32 %load3, %add.1

   %add.ptr8.3 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 768
   %load4 = load i32, i32 addrspace(1)* %add.ptr8.3, align 4
   %add.3 = add i32 %load4, %add.2

   %add.ptr8.4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1024
   %load5 = load i32, i32 addrspace(1)* %add.ptr8.4, align 4
   %add.4 = add i32 %load5, %add.3

   %add.ptr8.5 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1280
   %load6 = load i32, i32 addrspace(1)* %add.ptr8.5, align 4
   %add.5 = add i32 %load6, %add.4

   %add.ptr8.6 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1536
   %load7 = load i32, i32 addrspace(1)* %add.ptr8.6, align 4
   %add.6 = add i32 %load7, %add.5

   %add.ptr8.7 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1792
   %load8 = load i32, i32 addrspace(1)* %add.ptr8.7, align 4
   %add.7 = add i32 %load8, %add.6

   %add.ptr8.8 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2048
   %load9 = load i32, i32 addrspace(1)* %add.ptr8.8, align 4
   %add.8 = add i32 %load9, %add.7

   %add.ptr8.9 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2304
   %load10 = load i32, i32 addrspace(1)* %add.ptr8.9, align 4
   %add.9 = add i32 %load10, %add.8

   store i32 %add.9, i32 addrspace(1)* %addr, align 4
   ret void
}

; Sums four i64 loads that all address off one computed pointer (%addr1) and
; stores the sum to %saddr.  The loads use i64 element offsets 0, 536870400,
; 536870656 and 536870912, i.e. byte offsets 0, 0xfffff000, 0xfffff800 and
; 0x100000000; the last one does not fit in 32 bits.
define amdgpu_kernel void @Offset64(i8 addrspace(1)*  %buffer) {
; GFX8-LABEL: Offset64:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s38, -1
; GFX8-NEXT:    s_mov_b32 s39, 0xe80000
; GFX8-NEXT:    s_add_u32 s36, s36, s3
; GFX8-NEXT:    s_addc_u32 s37, s37, 0
; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX8-NEXT:    s_getpc_b64 s[0:1]
; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT:    v_mov_b32_e32 v31, v0
; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    s_mov_b32 s32, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT:    v_mov_b32_e32 v4, s35
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s34, v0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0xf000
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0xf800
; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[0:1]
; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 1, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v7
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v6, v8, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX900-LABEL: Offset64:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX900-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX900-NEXT:    s_mov_b32 s38, -1
; GFX900-NEXT:    s_mov_b32 s39, 0xe00000
; GFX900-NEXT:    s_add_u32 s36, s36, s3
; GFX900-NEXT:    s_addc_u32 s37, s37, 0
; GFX900-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX900-NEXT:    s_getpc_b64 s[0:1]
; GFX900-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX900-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX900-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX900-NEXT:    v_mov_b32_e32 v31, v0
; GFX900-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; GFX900-NEXT:    v_mov_b32_e32 v4, s35
; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, s34, v0
; GFX900-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, 0, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, 1, v1, vcc
; GFX900-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
; GFX900-NEXT:    global_load_dwordx2 v[9:10], v[7:8], off offset:-4096
; GFX900-NEXT:    s_movk_i32 s0, 0xf000
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX900-NEXT:    global_load_dwordx2 v[11:12], v[7:8], off
; GFX900-NEXT:    global_load_dwordx2 v[13:14], v[0:1], off offset:2048
; GFX900-NEXT:    s_waitcnt vmcnt(2)
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v9, v5
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v10, v6, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v13, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v14, v1, vcc
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v11, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v12, v1, vcc
; GFX900-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
; GFX900-NEXT:    s_endpgm
;
; GFX10-LABEL: Offset64:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s38, -1
; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000
; GFX10-NEXT:    s_add_u32 s36, s36, s3
; GFX10-NEXT:    s_addc_u32 s37, s37, 0
; GFX10-NEXT:    s_getpc_b64 s[2:3]
; GFX10-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v31, v0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT:    s_mov_b32 s32, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 7, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff8000, v2
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
; GFX10-NEXT:    v_add_co_u32 v2, s0, s34, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s35, 0, s0
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0xfffff800
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off offset:-2048
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 1, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[4:5], off
; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v6
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v7, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v10, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v12, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo
; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
; GFX10-NEXT:    s_endpgm
;
; GFX90A-LABEL: Offset64:
; GFX90A:       ; %bb.0: ; %entry
; GFX90A-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX90A-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX90A-NEXT:    s_mov_b32 s38, -1
; GFX90A-NEXT:    s_mov_b32 s39, 0xe00000
; GFX90A-NEXT:    s_add_u32 s36, s36, s3
; GFX90A-NEXT:    s_addc_u32 s37, s37, 0
; GFX90A-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX90A-NEXT:    s_getpc_b64 s[0:1]
; GFX90A-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX90A-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX90A-NEXT:    v_mov_b32_e32 v31, v0
; GFX90A-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_mov_b32 s32, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX90A-NEXT:    v_and_b32_e32 v2, 0xff, v0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
; GFX90A-NEXT:    v_mov_b32_e32 v1, s35
; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v0
; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT:    v_lshlrev_b64 v[2:3], 3, v[2:3]
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, 0, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 1, v3, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GFX90A-NEXT:    global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
; GFX90A-NEXT:    s_movk_i32 s0, 0xf000
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[10:11], v[6:7], off
; GFX90A-NEXT:    global_load_dwordx2 v[12:13], v[2:3], off offset:2048
; GFX90A-NEXT:    s_waitcnt vmcnt(2)
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v4
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v9, v5, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v12, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v13, v3, vcc
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v10, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v11, v3, vcc
; GFX90A-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; GFX90A-NEXT:    s_endpgm
;
; GFX11-LABEL: Offset64:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT:    s_load_b64 s[34:35], s[0:1], 0x24
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff8000, v2
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v2, s0, s34, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s35, 0, s0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v0, 0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 1, v1, vcc_lo
; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    global_load_b64 v[8:9], v[6:7], off offset:-4096
; GFX11-NEXT:    global_load_b64 v[6:7], v[6:7], off
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v8, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v9, v5, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v1, vcc_lo
; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  ; Result of the external @_Z13get_global_idj(0) call; both the index and the
  ; buffer offset below are derived from it.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  ; Low 8 bits of the call result, used below as the i64 element index.
  %conv = and i64 %call, 255
  ; (call result << 7) masked with 4294934528 (0xffff8000): byte offset applied
  ; to %buffer.
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  ; %saddr is the store destination and the base that %addr1 is indexed from.
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  ; Common base of all four loads: %saddr + %conv i64 elements.
  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  ; 536870400 elements * 8 bytes = 0xfffff000 bytes past %addr1.
  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870400
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8

  %add1 = add i64 %load2, %load1

  ; 536870656 elements * 8 bytes = 0xfffff800 bytes past %addr1.
  %addr3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870656
  %load3 = load i64, i64 addrspace(1)* %addr3, align 8

  %add2 = add i64 %load3, %add1

  ; 536870912 elements * 8 bytes = 0x100000000 bytes past %addr1; the expected
  ; output above forms this address by adding 1 to the high dword.
  %addr4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870912
  %load4 = load i64, i64 addrspace(1)* %addr4, align 8
  %add4 = add i64 %load4, %add2

  ; The sum is written to %saddr (the pointer before %conv is applied).
  store i64 %add4, i64 addrspace(1)* %saddr, align 8
  ret void
}

; TODO: Support load4 as anchor instruction.
; Sums four i32 loads that all address off one computed pointer (%addr1) and
; stores the sum to %saddr.  The loads use i32 element offsets 0, 536870400,
; 536870656 and 536870912, i.e. byte offsets 0, 0x7ffff800, 0x7ffffc00 and
; 0x80000000.
define amdgpu_kernel void @p32Offset64(i8 addrspace(1)*  %buffer) {
; GFX8-LABEL: p32Offset64:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s38, -1
; GFX8-NEXT:    s_mov_b32 s39, 0xe80000
; GFX8-NEXT:    s_add_u32 s36, s36, s3
; GFX8-NEXT:    s_addc_u32 s37, s37, 0
; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX8-NEXT:    s_getpc_b64 s[0:1]
; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT:    v_mov_b32_e32 v31, v0
; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    s_mov_b32 s32, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT:    v_mov_b32_e32 v4, s35
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s34, v0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[1:2]
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
; GFX8-NEXT:    s_mov_b32 s0, 0x7ffff800
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX8-NEXT:    s_mov_b32 s0, 0x7ffffc00
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v2, v[0:1]
; GFX8-NEXT:    flat_load_dword v5, v[5:6]
; GFX8-NEXT:    flat_load_dword v6, v[7:8]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x80000000, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v5, v2
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v6, v1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT:    flat_store_dword v[3:4], v0
; GFX8-NEXT:    s_endpgm
;
; GFX900-LABEL: p32Offset64:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX900-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX900-NEXT:    s_mov_b32 s38, -1
; GFX900-NEXT:    s_mov_b32 s39, 0xe00000
; GFX900-NEXT:    s_add_u32 s36, s36, s3
; GFX900-NEXT:    s_addc_u32 s37, s37, 0
; GFX900-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX900-NEXT:    s_getpc_b64 s[0:1]
; GFX900-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX900-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX900-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX900-NEXT:    v_mov_b32_e32 v31, v0
; GFX900-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; GFX900-NEXT:    v_mov_b32_e32 v4, s35
; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, s34, v0
; GFX900-NEXT:    v_lshlrev_b64 v[0:1], 2, v[1:2]
; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
; GFX900-NEXT:    v_add_co_u32_e32 v5, vcc, 0x7ffff000, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, 0x80000000, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
; GFX900-NEXT:    global_load_dword v2, v[0:1], off
; GFX900-NEXT:    global_load_dword v9, v[5:6], off offset:2048
; GFX900-NEXT:    global_load_dword v10, v[5:6], off offset:3072
; GFX900-NEXT:    global_load_dword v11, v[7:8], off
; GFX900-NEXT:    s_waitcnt vmcnt(2)
; GFX900-NEXT:    v_add_u32_e32 v0, v9, v2
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_add3_u32 v0, v10, v0, v11
; GFX900-NEXT:    global_store_dword v[3:4], v0, off
; GFX900-NEXT:    s_endpgm
;
; GFX10-LABEL: p32Offset64:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s38, -1
; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000
; GFX10-NEXT:    s_add_u32 s36, s36, s3
; GFX10-NEXT:    s_addc_u32 s37, s37, 0
; GFX10-NEXT:    s_getpc_b64 s[2:3]
; GFX10-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v31, v0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT:    s_mov_b32 s32, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 7, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff8000, v2
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX10-NEXT:    v_add_co_u32 v2, s0, s34, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s35, 0, s0
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0x80000000
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_dword v6, v[0:1], off
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ffff800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    global_load_dword v7, v[4:5], off offset:-2048
; GFX10-NEXT:    global_load_dword v8, v[4:5], off
; GFX10-NEXT:    global_load_dword v9, v[0:1], off offset:1024
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v7, v6
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add3_u32 v0, v9, v0, v8
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_endpgm
;
; GFX90A-LABEL: p32Offset64:
; GFX90A:       ; %bb.0: ; %entry
; GFX90A-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX90A-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX90A-NEXT:    s_mov_b32 s38, -1
; GFX90A-NEXT:    s_mov_b32 s39, 0xe00000
; GFX90A-NEXT:    s_add_u32 s36, s36, s3
; GFX90A-NEXT:    s_addc_u32 s37, s37, 0
; GFX90A-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX90A-NEXT:    s_getpc_b64 s[0:1]
; GFX90A-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX90A-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX90A-NEXT:    v_mov_b32_e32 v31, v0
; GFX90A-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_mov_b32 s32, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX90A-NEXT:    v_and_b32_e32 v2, 0xff, v0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
; GFX90A-NEXT:    v_mov_b32_e32 v1, s35
; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v0
; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7ffff000, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v3, vcc
; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, 0x80000000, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v3, vcc
; GFX90A-NEXT:    global_load_dword v8, v[2:3], off
; GFX90A-NEXT:    global_load_dword v9, v[4:5], off offset:2048
; GFX90A-NEXT:    global_load_dword v10, v[4:5], off offset:3072
; GFX90A-NEXT:    global_load_dword v11, v[6:7], off
; GFX90A-NEXT:    s_waitcnt vmcnt(2)
; GFX90A-NEXT:    v_add_u32_e32 v2, v9, v8
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_add3_u32 v2, v10, v2, v11
; GFX90A-NEXT:    global_store_dword v[0:1], v2, off
; GFX90A-NEXT:    s_endpgm
;
; GFX11-LABEL: p32Offset64:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT:    s_load_b64 s[34:35], s[0:1], 0x24
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff8000, v2
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v2, s0, s34, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s35, 0, s0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7ffff000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, 0x80000000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    global_load_b32 v1, v[4:5], off offset:2048
; GFX11-NEXT:    global_load_b32 v4, v[4:5], off offset:3072
; GFX11-NEXT:    global_load_b32 v5, v[6:7], off
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    v_add_nc_u32_e32 v0, v1, v0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add3_u32 v0, v4, v0, v5
; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  ; Result of the external @_Z13get_global_idj(0) call; both the index and the
  ; buffer offset below are derived from it.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  ; Low 8 bits of the call result, used below as the i32 element index.
  %conv = and i64 %call, 255
  ; (call result << 7) masked with 4294934528 (0xffff8000): byte offset applied
  ; to %buffer.
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  ; %saddr is the store destination and the base that %addr1 is indexed from.
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  ; Common base of all four loads: %saddr + %conv i32 elements.
  ; NOTE(review): the i32 accesses below are marked align 8 although %addr1 is
  ; indexed in 4-byte steps by %conv -- possibly carried over from the i64
  ; variant of this test; confirm it is intentional before changing, since the
  ; expected output would need to be regenerated.
  %addr1 = getelementptr inbounds i32, i32 addrspace(1)* %saddr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %addr1, align 8

  ; 536870400 elements * 4 bytes = 0x7ffff800 bytes past %addr1.
  %addr2 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870400
  %load2 = load i32, i32 addrspace(1)* %addr2, align 8

  %add1 = add i32 %load2, %load1

  ; 536870656 elements * 4 bytes = 0x7ffffc00 bytes past %addr1.
  %addr3 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870656
  %load3 = load i32, i32 addrspace(1)* %addr3, align 8

  %add2 = add i32 %load3, %add1

  ; 536870912 elements * 4 bytes = 0x80000000 bytes past %addr1.
  %addr4 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870912
  %load4 = load i32, i32 addrspace(1)* %addr4, align 8
  %add4 = add i32 %load4, %add2

  ; The sum is written to %saddr (the pointer before %conv is applied).
  store i32 %add4, i32 addrspace(1)* %saddr, align 8
  ret void
}

; Kernel with two distinct base pointers. Both %buffer1 and %buffer2 are
; advanced by the same per-work-item byte offset, then three i64 values are
; loaded from each base at constant element indices:
;   %buffer1 base: 512, 768, 1024   (byte offsets 0x1000, 0x1800, 0x2000)
;   %buffer2 base: 1280, 1536, 1792 (byte offsets 0x2800, 0x3000, 0x3800)
; The six values are summed and the result is stored at the %buffer1 base.
; NOTE(review): presumably this checks that constant offsets are only folded
; into load immediates relative to an address derived from the same base
; pointer - confirm against the pass under test.
define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1, i8 addrspace(1)* %buffer2) {
; GFX8-LABEL: DiffBase:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s42, -1
; GFX8-NEXT:    s_mov_b32 s43, 0xe80000
; GFX8-NEXT:    s_add_u32 s40, s40, s3
; GFX8-NEXT:    s_addc_u32 s41, s41, 0
; GFX8-NEXT:    s_load_dwordx4 s[36:39], s[0:1], 0x24
; GFX8-NEXT:    s_getpc_b64 s[0:1]
; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT:    s_mov_b64 s[0:1], s[40:41]
; GFX8-NEXT:    v_mov_b32_e32 v31, v0
; GFX8-NEXT:    s_mov_b64 s[2:3], s[42:43]
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    s_mov_b32 s32, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff8000, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s37
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s36, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_mov_b32_e32 v3, s39
; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s38, v2
; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v3, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x1000
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x1800
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x2000
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x2800
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s0, v12
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
; GFX8-NEXT:    flat_load_dwordx2 v[6:7], v[6:7]
; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v13, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x3000
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s0, v12
; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v13, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x3800
; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[8:9]
; GFX8-NEXT:    flat_load_dwordx2 v[10:11], v[10:11]
; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s0, v12
; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v13, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[12:13], v[12:13]
; GFX8-NEXT:    s_waitcnt vmcnt(4)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(3)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v7, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v10, v8
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v12, v4
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v13, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: DiffBase:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s42, -1
; GFX9-NEXT:    s_mov_b32 s43, 0xe00000
; GFX9-NEXT:    s_add_u32 s40, s40, s3
; GFX9-NEXT:    s_addc_u32 s41, s41, 0
; GFX9-NEXT:    s_load_dwordx4 s[36:39], s[0:1], 0x24
; GFX9-NEXT:    s_getpc_b64 s[0:1]
; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT:    s_mov_b64 s[0:1], s[40:41]
; GFX9-NEXT:    v_mov_b32_e32 v31, v0
; GFX9-NEXT:    s_mov_b64 s[2:3], s[42:43]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff8000, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, s37
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s36, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v3, s39
; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, s38, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v3, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0x1000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, 0x2000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:2048
; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[4:5], off
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0x2000, v12
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v13, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, 0x3000, v12
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v13, vcc
; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[2:3], off offset:2048
; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[4:5], off
; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[4:5], off offset:2048
; GFX9-NEXT:    s_waitcnt vmcnt(4)
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v6
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v9, v7, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(3)
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v10, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v11, v3, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v14, v12
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v15, v13, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v16, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v17, v5, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: DiffBase:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s42, -1
; GFX10-NEXT:    s_mov_b32 s43, 0x31c16000
; GFX10-NEXT:    s_add_u32 s40, s40, s3
; GFX10-NEXT:    s_addc_u32 s41, s41, 0
; GFX10-NEXT:    s_getpc_b64 s[2:3]
; GFX10-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v31, v0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX10-NEXT:    s_load_dwordx4 s[36:39], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b64 s[0:1], s[40:41]
; GFX10-NEXT:    s_mov_b64 s[2:3], s[42:43]
; GFX10-NEXT:    s_mov_b32 s32, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff8000, v0
; GFX10-NEXT:    v_add_co_u32 v0, s0, s36, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, s37, 0, s0
; GFX10-NEXT:    v_add_co_u32 v14, s0, s38, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v15, s0, s39, 0, s0
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x1800
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v14, 0x3000
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v15, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:-2048
; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x2000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[4:5], off offset:-2048
; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[4:5], off
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x3800, v14
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v15, vcc_lo
; GFX10-NEXT:    global_load_dwordx2 v[14:15], v[2:3], off
; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[4:5], off
; GFX10-NEXT:    s_waitcnt vmcnt(4)
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v6
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v7, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v12, v10
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v13, v11, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v14, v2
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v16, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v17, v5, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: DiffBase:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT:    s_load_b128 s[36:39], s[0:1], 0x24
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff8000, v0
; GFX11-NEXT:    v_add_co_u32 v0, s0, s36, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s37, 0, s0
; GFX11-NEXT:    v_add_co_u32 v10, s0, s38, v2
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x1000, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    v_add_co_ci_u32_e64 v11, null, s39, 0, s0
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0x2000
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, 0x2000, v10
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v11, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, 0x3000, v10
; GFX11-NEXT:    global_load_b64 v[8:9], v[4:5], off offset:-4096
; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v11, vcc_lo
; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:2048
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[6:7], v[6:7], off offset:2048
; GFX11-NEXT:    global_load_b64 v[12:13], v[10:11], off
; GFX11-NEXT:    global_load_b64 v[4:5], v[4:5], off
; GFX11-NEXT:    global_load_b64 v[10:11], v[10:11], off offset:2048
; GFX11-NEXT:    s_waitcnt vmcnt(4)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v8
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v9, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v12, v6
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v13, v7, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v10, v6
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v11, v7, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  ; Per-work-item byte offset: (get_global_id(0) << 7) & 0xffff8000. The same
  ; offset is applied to both buffers below.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer1, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %buffer2, i64 %idx.ext11
  %saddr2 = bitcast i8 addrspace(1)* %add.ptr2 to i64 addrspace(1)*

  ; Three loads addressed from the %buffer1-derived base.
  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 512
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 768
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add1 = add i64 %load2, %load1
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 1024
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add2 = add i64 %load3, %add1

  ; Three loads addressed from the %buffer2-derived base.
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1280
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1536
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add3 = add i64 %load5, %load4

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1792
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add4 = add i64 %load6, %add3

  ; Combine the two per-base partial sums and write the result back through
  ; the %buffer1-derived pointer.
  %add5 = add i64 %add2, %add4

  store i64 %add5, i64 addrspace(1)* %saddr, align 8
  ret void
}

define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
; GFX8-LABEL: ReverseOrder:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s38, -1
; GFX8-NEXT:    s_mov_b32 s39, 0xe80000
; GFX8-NEXT:    s_add_u32 s36, s36, s3
; GFX8-NEXT:    s_addc_u32 s37, s37, 0
; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX8-NEXT:    s_getpc_b64 s[0:1]
; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT:    v_mov_b32_e32 v31, v0
; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    s_mov_b32 s32, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT:    v_mov_b32_e32 v4, s35
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s34, v0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x3800
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x3000
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x2800
; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[11:12], v[0:1]
; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[7:8]
; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10]
; GFX8-NEXT:    s_movk_i32 s0, 0x2000
; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x1800
; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[13:14], v[13:14]
; GFX8-NEXT:    flat_load_dwordx2 v[15:16], v[15:16]
; GFX8-NEXT:    s_movk_i32 s0, 0x1000
; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[17:18], v[17:18]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x800, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(6)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v11
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v6, v12, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(5)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v7, v2
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v8, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(4)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(3)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v13, v2
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v14, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v15, v2
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v16, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v17, v2
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v18, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX900-LABEL: ReverseOrder:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX900-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX900-NEXT:    s_mov_b32 s38, -1
; GFX900-NEXT:    s_mov_b32 s39, 0xe00000
; GFX900-NEXT:    s_add_u32 s36, s36, s3
; GFX900-NEXT:    s_addc_u32 s37, s37, 0
; GFX900-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX900-NEXT:    s_getpc_b64 s[0:1]
; GFX900-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX900-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX900-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX900-NEXT:    v_mov_b32_e32 v31, v0
; GFX900-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; GFX900-NEXT:    v_mov_b32_e32 v4, s35
; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, s34, v0
; GFX900-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
; GFX900-NEXT:    s_movk_i32 s0, 0x3000
; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, s0, v0
; GFX900-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
; GFX900-NEXT:    global_load_dwordx2 v[9:10], v[7:8], off offset:2048
; GFX900-NEXT:    global_load_dwordx2 v[11:12], v[7:8], off
; GFX900-NEXT:    s_movk_i32 s0, 0x2000
; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, s0, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
; GFX900-NEXT:    global_load_dwordx2 v[13:14], v[7:8], off offset:2048
; GFX900-NEXT:    s_movk_i32 s0, 0x1000
; GFX900-NEXT:    v_add_co_u32_e32 v15, vcc, s0, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v16, vcc, 0, v1, vcc
; GFX900-NEXT:    global_load_dwordx2 v[17:18], v[15:16], off
; GFX900-NEXT:    global_load_dwordx2 v[19:20], v[7:8], off
; GFX900-NEXT:    global_load_dwordx2 v[21:22], v[15:16], off offset:2048
; GFX900-NEXT:    global_load_dwordx2 v[23:24], v[0:1], off offset:2048
; GFX900-NEXT:    s_waitcnt vmcnt(6)
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v9, v5
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v10, v6, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(5)
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v11, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v12, v1, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(4)
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v13, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v14, v1, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(2)
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v19, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v20, v1, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(1)
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v21, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v22, v1, vcc
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v17, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v18, v1, vcc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v23, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v24, v1, vcc
; GFX900-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
; GFX900-NEXT:    s_endpgm
;
; GFX10-LABEL: ReverseOrder:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s38, -1
; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000
; GFX10-NEXT:    s_add_u32 s36, s36, s3
; GFX10-NEXT:    s_addc_u32 s37, s37, 0
; GFX10-NEXT:    s_getpc_b64 s[2:3]
; GFX10-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v31, v0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT:    s_mov_b32 s32, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 7, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff8000, v2
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
; GFX10-NEXT:    v_add_co_u32 v2, s0, s34, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s35, 0, s0
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x3800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, 0x3000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[0:1], off
; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[4:5], off
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x2800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, 0x2000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v14, vcc_lo, 0x1800, v0
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[6:7], off
; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[12:13], off
; GFX10-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v16, vcc_lo, 0x1000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[14:15], v[14:15], off
; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[4:5], off
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[18:19], v[16:17], off
; GFX10-NEXT:    global_load_dwordx2 v[20:21], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(6)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v10, v8
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v11, v9, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(5)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v12, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v14, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v15, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v18, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v20, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v21, v1, vcc_lo
; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
; GFX10-NEXT:    s_endpgm
;
; GFX90A-LABEL: ReverseOrder:
; GFX90A:       ; %bb.0: ; %entry
; GFX90A-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX90A-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX90A-NEXT:    s_mov_b32 s38, -1
; GFX90A-NEXT:    s_mov_b32 s39, 0xe00000
; GFX90A-NEXT:    s_add_u32 s36, s36, s3
; GFX90A-NEXT:    s_addc_u32 s37, s37, 0
; GFX90A-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX90A-NEXT:    s_getpc_b64 s[0:1]
; GFX90A-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX90A-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX90A-NEXT:    v_mov_b32_e32 v31, v0
; GFX90A-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_mov_b32 s32, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX90A-NEXT:    v_and_b32_e32 v2, 0xff, v0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
; GFX90A-NEXT:    v_mov_b32_e32 v1, s35
; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v0
; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT:    v_lshlrev_b64 v[2:3], 3, v[2:3]
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX90A-NEXT:    s_movk_i32 s0, 0x3000
; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, s0, v2
; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v3, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[8:9], v[6:7], off offset:2048
; GFX90A-NEXT:    global_load_dwordx2 v[10:11], v[6:7], off
; GFX90A-NEXT:    s_movk_i32 s0, 0x2000
; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, s0, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v3, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[12:13], v[6:7], off offset:2048
; GFX90A-NEXT:    s_movk_i32 s0, 0x1000
; GFX90A-NEXT:    v_add_co_u32_e32 v14, vcc, s0, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v15, vcc, 0, v3, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[16:17], v[14:15], off
; GFX90A-NEXT:    global_load_dwordx2 v[18:19], v[6:7], off
; GFX90A-NEXT:    global_load_dwordx2 v[20:21], v[14:15], off offset:2048
; GFX90A-NEXT:    global_load_dwordx2 v[22:23], v[2:3], off offset:2048
; GFX90A-NEXT:    s_waitcnt vmcnt(6)
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v4
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v9, v5, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(5)
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v10, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v11, v3, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(4)
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v12, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v13, v3, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(2)
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v18, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v19, v3, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(1)
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v20, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v21, v3, vcc
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v16, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v17, v3, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v22, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v23, v3, vcc
; GFX90A-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; GFX90A-NEXT:    s_endpgm
;
; GFX11-LABEL: ReverseOrder:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT:    s_load_b64 s[34:35], s[0:1], 0x24
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff8000, v2
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v2, s0, s34, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s35, 0, s0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x3000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, 0x2000, v0
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    global_load_b64 v[6:7], v[0:1], off
; GFX11-NEXT:    global_load_b64 v[8:9], v[4:5], off offset:2048
; GFX11-NEXT:    global_load_b64 v[4:5], v[4:5], off
; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v12, vcc_lo, 0x1000, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT:    s_clause 0x4
; GFX11-NEXT:    global_load_b64 v[14:15], v[10:11], off offset:2048
; GFX11-NEXT:    global_load_b64 v[16:17], v[12:13], off
; GFX11-NEXT:    global_load_b64 v[10:11], v[10:11], off
; GFX11-NEXT:    global_load_b64 v[12:13], v[12:13], off offset:2048
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
; GFX11-NEXT:    s_waitcnt vmcnt(6)
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v9, v7, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(5)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(4)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v14, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v15, v5, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v10, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v11, v5, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v12, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v13, v5, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v16, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v17, v5, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add7 = add i64 %load8, %load1

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add6 = add i64 %load7, %add7

  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add5 = add i64 %load6, %add6

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add4 = add i64 %load5, %add5

  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add3 = add i64 %load4, %add4

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add2 = add i64 %load3, %add3

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add1 = add i64 %load2, %add2

  store i64 %add1, i64 addrspace(1)* %saddr, align 8
  ret void
}

; negativeoffset: loads two i64 values at large negative constant offsets from
; a pointer derived from %buffer and the result of @_Z13get_global_idj(i32 0),
; adds them, and stores the sum to %buffer_head.
; The two element offsets used below, -536870656 and -536870912, correspond to
; byte offsets of -2^32 + 2048 and -2^32, so the two loaded addresses are
; exactly 2048 bytes apart. The check lines below record how each target forms
; those addresses (a -1 added into the high dword, plus either a 0x800 low-dword
; add or a 0x1000 low-dword add combined with an instruction offset of -2048).
define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buffer) {
; GFX8-LABEL: negativeoffset:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s38, -1
; GFX8-NEXT:    s_mov_b32 s39, 0xe80000
; GFX8-NEXT:    s_add_u32 s36, s36, s3
; GFX8-NEXT:    s_addc_u32 s37, s37, 0
; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX8-NEXT:    s_getpc_b64 s[0:1]
; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT:    v_mov_b32_e32 v31, v0
; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    s_mov_b32 s32, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT:    v_mov_b32_e32 v4, s35
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s34, v0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v0
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v4, v1, vcc
; GFX8-NEXT:    s_movk_i32 s0, 0x800
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v6, vcc
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0, v2
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, -1, v6, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX900-LABEL: negativeoffset:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX900-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX900-NEXT:    s_mov_b32 s38, -1
; GFX900-NEXT:    s_mov_b32 s39, 0xe00000
; GFX900-NEXT:    s_add_u32 s36, s36, s3
; GFX900-NEXT:    s_addc_u32 s37, s37, 0
; GFX900-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX900-NEXT:    s_getpc_b64 s[0:1]
; GFX900-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX900-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX900-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX900-NEXT:    v_mov_b32_e32 v31, v0
; GFX900-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; GFX900-NEXT:    v_mov_b32_e32 v4, s35
; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, s34, v0
; GFX900-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v0
; GFX900-NEXT:    v_addc_co_u32_e32 v6, vcc, v4, v1, vcc
; GFX900-NEXT:    s_movk_i32 s0, 0x1000
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
; GFX900-NEXT:    v_add_co_u32_e32 v5, vcc, 0, v2
; GFX900-NEXT:    v_addc_co_u32_e32 v6, vcc, -1, v6, vcc
; GFX900-NEXT:    global_load_dwordx2 v[7:8], v[0:1], off offset:-2048
; GFX900-NEXT:    global_load_dwordx2 v[9:10], v[5:6], off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v9, v7
; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v10, v8, vcc
; GFX900-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
; GFX900-NEXT:    s_endpgm
;
; GFX10-LABEL: negativeoffset:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s38, -1
; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000
; GFX10-NEXT:    s_add_u32 s36, s36, s3
; GFX10-NEXT:    s_addc_u32 s37, s37, 0
; GFX10-NEXT:    s_getpc_b64 s[2:3]
; GFX10-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v31, v0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT:    s_mov_b32 s32, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 7, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff8000, v2
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
; GFX10-NEXT:    v_add_co_u32 v2, s0, s34, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s35, 0, s0
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v3, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v5, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v6
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v7, vcc_lo
; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
; GFX10-NEXT:    s_endpgm
;
; GFX90A-LABEL: negativeoffset:
; GFX90A:       ; %bb.0: ; %entry
; GFX90A-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX90A-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX90A-NEXT:    s_mov_b32 s38, -1
; GFX90A-NEXT:    s_mov_b32 s39, 0xe00000
; GFX90A-NEXT:    s_add_u32 s36, s36, s3
; GFX90A-NEXT:    s_addc_u32 s37, s37, 0
; GFX90A-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX90A-NEXT:    s_getpc_b64 s[0:1]
; GFX90A-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX90A-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT:    s_mov_b64 s[0:1], s[36:37]
; GFX90A-NEXT:    v_mov_b32_e32 v31, v0
; GFX90A-NEXT:    s_mov_b64 s[2:3], s[38:39]
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_mov_b32 s32, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX90A-NEXT:    v_and_b32_e32 v2, 0xff, v0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
; GFX90A-NEXT:    v_mov_b32_e32 v1, s35
; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v0
; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT:    v_lshlrev_b64 v[2:3], 3, v[2:3]
; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
; GFX90A-NEXT:    s_movk_i32 s0, 0x1000
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v5, vcc
; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0, v4
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v5, vcc
; GFX90A-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:-2048
; GFX90A-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v6
; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v9, v7, vcc
; GFX90A-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; GFX90A-NEXT:    s_endpgm
;
; GFX11-LABEL: negativeoffset:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT:    s_load_b64 s[34:35], s[0:1], 0x24
; GFX11-NEXT:    s_mov_b32 s32, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff8000, v2
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v2, s0, s34, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s35, 0, s0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v2, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v3, v1, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v5, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:-2048
; GFX11-NEXT:    global_load_b64 v[4:5], v[4:5], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0) #2
  ; Low 8 bits of the call result: an i64 element index in the range 0..255.
  %conv = and i64 %call, 255
  ; (call << 7) masked with 4294934528 (0xffff8000) is the byte offset of
  ; %buffer_head from %buffer.
  %0 = shl i64 %call, 7
  %idx.ext11 = and i64 %0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %buffer_head = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  ; Common base for both loads: %buffer_head advanced by %conv i64 elements.
  %buffer_wave = getelementptr inbounds i64, i64 addrspace(1)* %buffer_head, i64 %conv

  ; -536870656 i64 elements = -2^32 + 2048 bytes from %buffer_wave.
  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870656
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  ; -536870912 i64 elements = -2^32 bytes from %buffer_wave.
  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870912
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8


  %add = add i64 %load2, %load1

  ; The sum is written to %buffer_head, not to %buffer_wave.
  store i64 %add, i64 addrspace(1)* %buffer_head, align 8
  ret void
}

; Attribute group #0 is attached to the declaration of @_Z13get_global_idj at
; the top of this file.
attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }