; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -march=amdgcn -mcpu=gfx900 -amdgpu-aa -amdgpu-aa-wrapper -amdgpu-annotate-uniform -S < %s | FileCheck %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Check that a barrier or fence in between loads is not considered a clobber
; for the purpose of converting vector loads into scalar loads.

@LDS = linkonce_odr hidden local_unnamed_addr addrspace(3) global i32 undef

; GCN-LABEL: {{^}}simple_barrier:
; GCN: s_load_dword s
; GCN: s_waitcnt lgkmcnt(0)
; GCN: s_barrier
; GCN: s_waitcnt lgkmcnt(0)
; GCN: ; wave barrier
; GCN-NOT: global_load_dword
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @simple_barrier(i32 addrspace(1)* %arg) {
; CHECK-LABEL: @simple_barrier(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, i32 addrspace(1)* %arg, align 4
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  tail call void @llvm.amdgcn.wave.barrier()
  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
  %i2 = load i32, i32 addrspace(1)* %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
  store i32 %i3, i32 addrspace(1)* %i4, align 4
  ret void
}

; GCN-LABEL: {{^}}memory_phi_no_clobber:
; GCN: s_load_dword s
; GCN: s_waitcnt lgkmcnt(0)
; GCN: s_waitcnt lgkmcnt(0)
; GCN: s_barrier
; GCN-NOT: global_load_dword
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @memory_phi_no_clobber(i32 addrspace(1)* %arg) {
; CHECK-LABEL: @memory_phi_no_clobber(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
; CHECK:       if.then:
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    br label [[IF_END:%.*]], !amdgpu.uniform !0
; CHECK:       if.else:
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    br label [[IF_END]], !amdgpu.uniform !0
; CHECK:       if.end:
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, i32 addrspace(1)* %arg, align 4
  br i1 undef, label %if.then, label %if.else

if.then:
  tail call void @llvm.amdgcn.s.barrier()
  br label %if.end

if.else:
  fence syncscope("workgroup") release
  br label %if.end

if.end:
  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
  %i2 = load i32, i32 addrspace(1)* %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
  store i32 %i3, i32 addrspace(1)* %i4, align 4
  ret void
}

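; A store on one path of the memory phi is treated as a clobber: the second
; load loses its !amdgpu.noclobber annotation and is selected as global_load_dword.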
; GCN-LABEL: {{^}}memory_phi_clobber1:
; GCN: s_load_dword s
; GCN: s_barrier
; GCN: global_store_dword
; GCN: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @memory_phi_clobber1(i32 addrspace(1)* %arg) {
; CHECK-LABEL: @memory_phi_clobber1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
; CHECK:       if.then:
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 3
; CHECK-NEXT:    store i32 1, i32 addrspace(1)* [[GEP]], align 4
; CHECK-NEXT:    br label [[IF_END:%.*]], !amdgpu.uniform !0
; CHECK:       if.else:
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    br label [[IF_END]], !amdgpu.uniform !0
; CHECK:       if.end:
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, i32 addrspace(1)* %arg, align 4
  br i1 undef, label %if.then, label %if.else

if.then:
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
  store i32 1, i32 addrspace(1)* %gep, align 4
  br label %if.end

if.else:
  tail call void @llvm.amdgcn.s.barrier()
  br label %if.end

if.end:
  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
  %i2 = load i32, i32 addrspace(1)* %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
  store i32 %i3, i32 addrspace(1)* %i4, align 4
  ret void
}

; GCN-LABEL: {{^}}memory_phi_clobber2:
; GCN-DAG: s_load_dword s
; GCN-DAG: global_store_dword
; GCN: s_barrier
; GCN: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @memory_phi_clobber2(i32 addrspace(1)* %arg) {
; CHECK-LABEL: @memory_phi_clobber2(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
; CHECK:       if.then:
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    br label [[IF_END:%.*]], !amdgpu.uniform !0
; CHECK:       if.else:
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 3
; CHECK-NEXT:    store i32 1, i32 addrspace(1)* [[GEP]], align 4
; CHECK-NEXT:    br label [[IF_END]], !amdgpu.uniform !0
; CHECK:       if.end:
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, i32 addrspace(1)* %arg, align 4
  br i1 undef, label %if.then, label %if.else

if.then:
  tail call void @llvm.amdgcn.s.barrier()
  br label %if.end

if.else:
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
  store i32 1, i32 addrspace(1)* %gep, align 4
  br label %if.end

if.end:
  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
  %i2 = load i32, i32 addrspace(1)* %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
  store i32 %i3, i32 addrspace(1)* %i4, align 4
  ret void
}

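; Neither the wave barrier nor the store to a different offset inside the loop
; clobbers the loads: both keep !amdgpu.noclobber and are selected as s_load_dword.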
; GCN-LABEL: {{^}}no_clobbering_loop1:
; GCN: s_load_dword s
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @no_clobbering_loop1(i32 addrspace(1)* %arg, i1 %cc) {
; CHECK-LABEL: @no_clobbering_loop1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
; CHECK:       while.cond:
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
; CHECK-NEXT:    br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
; CHECK:       end:
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, i32 addrspace(1)* %arg, align 4
  br label %while.cond

while.cond:
  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
  %i2 = load i32, i32 addrspace(1)* %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
  store i32 %i3, i32 addrspace(1)* %i4, align 4
  tail call void @llvm.amdgcn.wave.barrier()
  br i1 %cc, label %while.cond, label %end

end:
  ret void
}

; GCN-LABEL: {{^}}no_clobbering_loop2:
; GCN: s_load_dword s
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @no_clobbering_loop2(i32 addrspace(1)* noalias %arg, i32 addrspace(1)* noalias %out, i32 %n) {
; CHECK-LABEL: @no_clobbering_loop2(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
; CHECK:       while.cond:
; CHECK-NEXT:    [[C:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[INC:%.*]], [[WHILE_COND]] ]
; CHECK-NEXT:    [[ACC:%.*]] = phi i32 [ [[I]], [[BB]] ], [ [[I3:%.*]], [[WHILE_COND]] ]
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i32 [[C]], !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I3]] = add i32 [[I2]], [[ACC]]
; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[C]], 1
; CHECK-NEXT:    [[CC:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
; CHECK-NEXT:    br i1 [[CC]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
; CHECK:       end:
; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, i32 addrspace(1)* %arg, align 4
  br label %while.cond

while.cond:
  %c = phi i32 [ 0, %bb ], [ %inc, %while.cond ]
  %acc = phi i32 [ %i, %bb ], [ %i3, %while.cond ]
  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %c
  %i2 = load i32, i32 addrspace(1)* %i1, align 4
  %i3 = add i32 %i2, %acc
  tail call void @llvm.amdgcn.wave.barrier()
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, %n
  br i1 %cc, label %while.cond, label %end

end:
  store i32 %i3, i32 addrspace(1)* %out, align 4
  ret void
}

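; Here %arg and %out may alias, so the store inside the loop clobbers the load
; on the back edge and it is emitted as global_load_dword.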
; GCN-LABEL: {{^}}clobbering_loop:
; GCN: s_load_dword s
; GCN: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @clobbering_loop(i32 addrspace(1)* %arg, i32 addrspace(1)* %out, i1 %cc) {
; CHECK-LABEL: @clobbering_loop(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
; CHECK:       while.cond:
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[OUT:%.*]], i64 1
; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
; CHECK-NEXT:    br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
; CHECK:       end:
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, i32 addrspace(1)* %arg, align 4
  br label %while.cond

while.cond:
  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
  %i2 = load i32, i32 addrspace(1)* %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 %i3, i32 addrspace(1)* %i4, align 4
  tail call void @llvm.amdgcn.wave.barrier()
  br i1 %cc, label %while.cond, label %end

end:
  ret void
}

; GCN-LABEL: {{^}}clobber_by_atomic_load:
; GCN: s_load_dword s
; GCN: global_load_dword {{.*}} glc
; GCN: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @clobber_by_atomic_load(i32 addrspace(1)* %arg) {
; CHECK-LABEL: @clobber_by_atomic_load(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2, !amdgpu.uniform !0
; CHECK-NEXT:    [[VAL:%.*]] = load atomic i32, i32 addrspace(1)* [[GEP]] seq_cst, align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 3, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 4
; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, i32 addrspace(1)* %arg, align 4
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
  %val = load atomic i32, i32 addrspace(1)* %gep seq_cst, align 4
  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
  %i2 = load i32, i32 addrspace(1)* %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4
  store i32 %i3, i32 addrspace(1)* %i4, align 4
  ret void
}

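; A store to LDS cannot alias the global pointer %in, so the load after the
; barrier keeps !amdgpu.noclobber and becomes a scalar load; the may_alias_store
; variant stores through %out instead and keeps the vector load.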
; GCN-LABEL: {{^}}no_alias_store:
; GCN: ds_write_b32
; GCN: s_barrier
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
; CHECK-LABEL: @no_alias_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    store i32 0, i32 addrspace(3)* @LDS, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  store i32 0, i32 addrspace(3)* @LDS, align 4
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, i32 addrspace(1)* %in, align 4
  store i32 %ld, i32 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}may_alias_store:
; GCN: global_store_dword
; GCN: s_barrier
; GCN: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @may_alias_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
; CHECK-LABEL: @may_alias_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    store i32 0, i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4
; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT]], align 4
; CHECK-NEXT:    ret void
;
entry:
  store i32 0, i32 addrspace(1)* %out, align 4
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, i32 addrspace(1)* %in, align 4
  store i32 %ld, i32 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}no_alias_volatile_store:
; GCN: ds_write_b32
; GCN: s_barrier
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_volatile_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
; CHECK-LABEL: @no_alias_volatile_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    store volatile i32 0, i32 addrspace(3)* @LDS, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  store volatile i32 0, i32 addrspace(3)* @LDS, align 4
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, i32 addrspace(1)* %in, align 4
  store i32 %ld, i32 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}no_alias_atomic_rmw_relaxed:
; GCN: ds_add_u32
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_rmw_relaxed(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
; CHECK-LABEL: @no_alias_atomic_rmw_relaxed(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 monotonic, align 4
; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %unused = atomicrmw add i32 addrspace(3)* @LDS, i32 5 monotonic
  %ld = load i32, i32 addrspace(1)* %in, align 4
  store i32 %ld, i32 addrspace(1)* %out, align 4
  ret void
}

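; Likewise, atomic cmpxchg and rmw operations on LDS do not clobber the load
; from the global pointer %in, while the same atomics on %out do.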
; GCN-LABEL: {{^}}no_alias_atomic_cmpxchg:
; GCN: ds_cmpst_b32
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_cmpxchg(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %swap) {
; CHECK-LABEL: @no_alias_atomic_cmpxchg(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UNUSED:%.*]] = cmpxchg i32 addrspace(3)* @LDS, i32 7, i32 [[SWAP:%.*]] seq_cst monotonic, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %unused = cmpxchg i32 addrspace(3)* @LDS, i32 7, i32 %swap seq_cst monotonic
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, i32 addrspace(1)* %in, align 4
  store i32 %ld, i32 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}no_alias_atomic_rmw:
; GCN: ds_add_u32
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_rmw(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
; CHECK-LABEL: @no_alias_atomic_rmw(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %unused = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, i32 addrspace(1)* %in, align 4
  store i32 %ld, i32 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}may_alias_atomic_cmpxchg:
; GCN: global_atomic_cmpswap
; GCN: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @may_alias_atomic_cmpxchg(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %swap) {
; CHECK-LABEL: @may_alias_atomic_cmpxchg(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UNUSED:%.*]] = cmpxchg i32 addrspace(1)* [[OUT:%.*]], i32 7, i32 [[SWAP:%.*]] seq_cst monotonic, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4
; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %unused = cmpxchg i32 addrspace(1)* %out, i32 7, i32 %swap seq_cst monotonic
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, i32 addrspace(1)* %in, align 4
  store i32 %ld, i32 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}may_alias_atomic_rmw:
; GCN: global_atomic_add
; GCN: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @may_alias_atomic_rmw(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
; CHECK-LABEL: @may_alias_atomic_rmw(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add i32 addrspace(1)* [[OUT:%.*]], i32 5 seq_cst, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4
; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %unused = atomicrmw add i32 addrspace(1)* %out, i32 5 seq_cst
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, i32 addrspace(1)* %in, align 4
  store i32 %ld, i32 addrspace(1)* %out, align 4
  ret void
}

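; The earlier plain store to %out may alias %in, so the load below is clobbered
; even though the atomicrmw itself is on LDS; with only the store to the noalias
; pointer the load stays a scalar load.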
; GCN-LABEL: {{^}}no_alias_atomic_rmw_then_clobber:
; GCN: global_store_dword
; GCN: global_store_dword
; GCN: ds_add_u32
; GCN: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_rmw_then_clobber(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 addrspace(1)* noalias %noalias) {
; CHECK-LABEL: @no_alias_atomic_rmw_then_clobber(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    store i32 1, i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    store i32 2, i32 addrspace(1)* [[NOALIAS:%.*]], align 4
; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4
; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT]], align 4
; CHECK-NEXT:    ret void
;
entry:
  store i32 1, i32 addrspace(1)* %out, align 4
  store i32 2, i32 addrspace(1)* %noalias, align 4
  %unused = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, i32 addrspace(1)* %in, align 4
  store i32 %ld, i32 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}no_alias_atomic_rmw_then_no_alias_store:
; GCN: global_store_dword
; GCN: ds_add_u32
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_rmw_then_no_alias_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 addrspace(1)* noalias %noalias) {
; CHECK-LABEL: @no_alias_atomic_rmw_then_no_alias_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    store i32 2, i32 addrspace(1)* [[NOALIAS:%.*]], align 4
; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  store i32 2, i32 addrspace(1)* %noalias, align 4
  %unused = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, i32 addrspace(1)* %in, align 4
  store i32 %ld, i32 addrspace(1)* %out, align 4
  ret void
}

declare void @llvm.amdgcn.s.barrier()
declare void @llvm.amdgcn.wave.barrier()