; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-32 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-64 %s define amdgpu_ps void @static_exact(float %arg0, float %arg1) { ; SI-LABEL: static_exact: ; SI: ; %bb.0: ; %.entry ; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 ; SI-NEXT: s_andn2_b64 exec, exec, exec ; SI-NEXT: s_cbranch_scc0 .LBB0_2 ; SI-NEXT: ; %bb.1: ; %.entry ; SI-NEXT: s_mov_b64 exec, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB0_2: ; SI-NEXT: s_mov_b64 exec, 0 ; SI-NEXT: exp null off, off, off, off done vm ; SI-NEXT: s_endpgm ; ; GFX9-LABEL: static_exact: ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, exec ; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; %.entry ; GFX9-NEXT: s_mov_b64 exec, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_mov_b64 exec, 0 ; GFX9-NEXT: exp null off, off, off, off done vm ; GFX9-NEXT: s_endpgm ; ; GFX10-32-LABEL: static_exact: ; GFX10-32: ; %bb.0: ; %.entry ; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10-32-NEXT: ; %bb.1: ; %.entry ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo ; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; 
GFX10-32-NEXT: s_endpgm ; GFX10-32-NEXT: .LBB0_2: ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-32-NEXT: exp null off, off, off, off done vm ; GFX10-32-NEXT: s_endpgm ; ; GFX10-64-LABEL: static_exact: ; GFX10-64: ; %bb.0: ; %.entry ; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_andn2_b64 exec, exec, exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10-64-NEXT: ; %bb.1: ; %.entry ; GFX10-64-NEXT: s_mov_b64 exec, 0 ; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX10-64-NEXT: s_endpgm ; GFX10-64-NEXT: .LBB0_2: ; GFX10-64-NEXT: s_mov_b64 exec, 0 ; GFX10-64-NEXT: exp null off, off, off, off done vm ; GFX10-64-NEXT: s_endpgm .entry: %c0 = fcmp olt float %arg0, 0.000000e+00 %c1 = fcmp oge float %arg1, 0.0 call void @llvm.amdgcn.wqm.demote(i1 false) %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 ret void } define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { ; SI-LABEL: dynamic_exact: ; SI: ; %bb.0: ; %.entry ; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 ; SI-NEXT: s_mov_b64 s[2:3], exec ; SI-NEXT: s_xor_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] ; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 ; SI-NEXT: s_cbranch_scc0 .LBB1_2 ; SI-NEXT: ; %bb.1: ; %.entry ; SI-NEXT: s_and_b64 exec, exec, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB1_2: ; SI-NEXT: s_mov_b64 exec, 0 ; SI-NEXT: exp null off, off, off, off done vm ; SI-NEXT: s_endpgm ; ; GFX9-LABEL: dynamic_exact: ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 ; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 ; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; 
%.entry ; GFX9-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_mov_b64 exec, 0 ; GFX9-NEXT: exp null off, off, off, off done vm ; GFX9-NEXT: s_endpgm ; ; GFX10-32-LABEL: dynamic_exact: ; GFX10-32: ; %bb.0: ; %.entry ; GFX10-32-NEXT: v_cmp_le_f32_e64 s0, 0, v1 ; GFX10-32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_xor_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_andn2_b32 s1, s1, s0 ; GFX10-32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10-32-NEXT: ; %bb.1: ; %.entry ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo ; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX10-32-NEXT: s_endpgm ; GFX10-32-NEXT: .LBB1_2: ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-32-NEXT: exp null off, off, off, off done vm ; GFX10-32-NEXT: s_endpgm ; ; GFX10-64-LABEL: dynamic_exact: ; GFX10-64: ; %bb.0: ; %.entry ; GFX10-64-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 ; GFX10-64-NEXT: s_mov_b64 s[2:3], exec ; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_xor_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] ; GFX10-64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10-64-NEXT: ; %bb.1: ; %.entry ; GFX10-64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX10-64-NEXT: s_endpgm ; GFX10-64-NEXT: .LBB1_2: ; GFX10-64-NEXT: s_mov_b64 exec, 0 ; GFX10-64-NEXT: exp null off, off, off, off done vm ; GFX10-64-NEXT: s_endpgm .entry: %c0 = fcmp olt float %arg0, 0.000000e+00 %c1 = fcmp oge float %arg1, 0.0 call void @llvm.amdgcn.wqm.demote(i1 %c1) %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 ret void } define amdgpu_ps void 
@branch(float %arg0, float %arg1) { ; SI-LABEL: branch: ; SI: ; %bb.0: ; %.entry ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: s_xor_b64 s[2:3], vcc, -1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB2_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB2_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_mov_b64 exec, 0 ; SI-NEXT: .LBB2_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB2_4: ; SI-NEXT: s_mov_b64 exec, 0 ; SI-NEXT: exp null off, off, off, off done vm ; SI-NEXT: s_endpgm ; ; GFX9-LABEL: branch: ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_xor_b64 s[2:3], vcc, -1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_mov_b64 exec, 0 ; GFX9-NEXT: .LBB2_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_mov_b64 exec, 0 ; GFX9-NEXT: exp null off, off, off, off done vm ; GFX9-NEXT: s_endpgm ; ; GFX10-32-LABEL: branch: ; GFX10-32: ; %bb.0: ; %.entry ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v1, 
v1 ; GFX10-32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_xor_b32 s1, vcc_lo, -1 ; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 ; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 ; GFX10-32-NEXT: s_cbranch_execz .LBB2_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-32-NEXT: .LBB2_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo ; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX10-32-NEXT: s_endpgm ; GFX10-32-NEXT: .LBB2_4: ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-32-NEXT: exp null off, off, off, off done vm ; GFX10-32-NEXT: s_endpgm ; ; GFX10-64-LABEL: branch: ; GFX10-64: ; %bb.0: ; %.entry ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX10-64-NEXT: s_mov_b64 s[0:1], exec ; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_xor_b64 s[2:3], vcc, -1 ; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] ; GFX10-64-NEXT: s_cbranch_execz .LBB2_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_mov_b64 exec, 0 ; GFX10-64-NEXT: .LBB2_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX10-64-NEXT: s_endpgm ; GFX10-64-NEXT: .LBB2_4: ; GFX10-64-NEXT: s_mov_b64 exec, 0 ; GFX10-64-NEXT: exp null off, off, off, off done vm ; GFX10-64-NEXT: s_endpgm .entry: %i0 = fptosi float %arg0 to i32 %i1 = fptosi float 
%arg1 to i32 %c0 = or i32 %i0, %i1 %c1 = and i32 %c0, 1 %c2 = icmp eq i32 %c1, 0 br i1 %c2, label %.continue, label %.demote .demote: call void @llvm.amdgcn.wqm.demote(i1 false) br label %.continue .continue: %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00 call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 ret void } define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { ; SI-LABEL: wqm_demote_1: ; SI: ; %bb.0: ; %.entry ; SI-NEXT: s_mov_b64 s[12:13], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc ; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] ; SI-NEXT: s_cbranch_execz .LBB3_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; SI-NEXT: s_cbranch_scc0 .LBB3_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[16:17] ; SI-NEXT: .LBB3_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[14:15] ; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v0 ; SI-NEXT: s_and_b64 exec, exec, s[12:13] ; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_branch .LBB3_5 ; SI-NEXT: .LBB3_4: ; SI-NEXT: s_mov_b64 exec, 0 ; SI-NEXT: exp null off, off, off, off done vm ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB3_5: ; ; GFX9-LABEL: wqm_demote_1: ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] ; GFX9-NEXT: s_cbranch_execz .LBB3_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 ; 
GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] ; GFX9-NEXT: .LBB3_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_branch .LBB3_5 ; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: s_mov_b64 exec, 0 ; GFX9-NEXT: exp null off, off, off, off done vm ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB3_5: ; ; GFX10-32-LABEL: wqm_demote_1: ; GFX10-32: ; %bb.0: ; %.entry ; GFX10-32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v1 ; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 ; GFX10-32-NEXT: s_cbranch_execz .LBB3_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_wqm_b32 s14, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-32-NEXT: .LBB3_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-32-NEXT: s_waitcnt vmcnt(0) ; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-32-NEXT: s_waitcnt vmcnt(0) ; GFX10-32-NEXT: s_branch .LBB3_5 ; GFX10-32-NEXT: .LBB3_4: ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-32-NEXT: exp null off, off, off, off done vm ; GFX10-32-NEXT: s_endpgm ; GFX10-32-NEXT: .LBB3_5: ; ; GFX10-64-LABEL: wqm_demote_1: ; GFX10-64: ; %bb.0: ; %.entry ; GFX10-64-NEXT: s_mov_b64 s[12:13], exec ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; 
GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] ; GFX10-64-NEXT: s_cbranch_execz .LBB3_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] ; GFX10-64-NEXT: .LBB3_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: s_branch .LBB3_5 ; GFX10-64-NEXT: .LBB3_4: ; GFX10-64-NEXT: s_mov_b64 exec, 0 ; GFX10-64-NEXT: exp null off, off, off, off done vm ; GFX10-64-NEXT: s_endpgm ; GFX10-64-NEXT: .LBB3_5: .entry: %z.cmp = fcmp olt float %z, 0.0 br i1 %z.cmp, label %.continue, label %.demote .demote: call void @llvm.amdgcn.wqm.demote(i1 false) br label %.continue .continue: %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 %tex1 = extractelement <4 x float> %tex, i32 0 %coord1 = fadd float %tex0, %tex1 %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 ret <4 x float> %rtex } define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { ; SI-LABEL: wqm_demote_2: ; SI: ; %bb.0: ; %.entry ; SI-NEXT: s_mov_b64 s[12:13], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: image_sample v[0:3], v0, 
s[0:7], s[8:11] dmask:0xf ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc ; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] ; SI-NEXT: s_cbranch_execz .LBB4_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; SI-NEXT: s_cbranch_scc0 .LBB4_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[16:17] ; SI-NEXT: .LBB4_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[14:15] ; SI-NEXT: v_add_f32_e32 v0, v0, v0 ; SI-NEXT: s_and_b64 exec, exec, s[12:13] ; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_branch .LBB4_5 ; SI-NEXT: .LBB4_4: ; SI-NEXT: s_mov_b64 exec, 0 ; SI-NEXT: exp null off, off, off, off done vm ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB4_5: ; ; GFX9-LABEL: wqm_demote_2: ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] ; GFX9-NEXT: .LBB4_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_branch .LBB4_5 ; GFX9-NEXT: .LBB4_4: ; GFX9-NEXT: s_mov_b64 exec, 0 ; GFX9-NEXT: exp null off, off, off, off done vm ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB4_5: ; ; GFX10-32-LABEL: wqm_demote_2: ; GFX10-32: ; %bb.0: ; %.entry ; 
GFX10-32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-32-NEXT: s_waitcnt vmcnt(0) ; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 ; GFX10-32-NEXT: s_cbranch_execz .LBB4_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_wqm_b32 s14, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-32-NEXT: .LBB4_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-32-NEXT: s_waitcnt vmcnt(0) ; GFX10-32-NEXT: s_branch .LBB4_5 ; GFX10-32-NEXT: .LBB4_4: ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-32-NEXT: exp null off, off, off, off done vm ; GFX10-32-NEXT: s_endpgm ; GFX10-32-NEXT: .LBB4_5: ; ; GFX10-64-LABEL: wqm_demote_2: ; GFX10-64: ; %bb.0: ; %.entry ; GFX10-64-NEXT: s_mov_b64 s[12:13], exec ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] ; GFX10-64-NEXT: s_cbranch_execz .LBB4_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] ; GFX10-64-NEXT: .LBB4_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 ; 
GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: s_branch .LBB4_5 ; GFX10-64-NEXT: .LBB4_4: ; GFX10-64-NEXT: s_mov_b64 exec, 0 ; GFX10-64-NEXT: exp null off, off, off, off done vm ; GFX10-64-NEXT: s_endpgm ; GFX10-64-NEXT: .LBB4_5: .entry: %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 %tex1 = extractelement <4 x float> %tex, i32 0 %z.cmp = fcmp olt float %tex0, 0.0 br i1 %z.cmp, label %.continue, label %.demote .demote: call void @llvm.amdgcn.wqm.demote(i1 false) br label %.continue .continue: %coord1 = fadd float %tex0, %tex1 %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 ret <4 x float> %rtex } define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { ; SI-LABEL: wqm_demote_dynamic: ; SI: ; %bb.0: ; %.entry ; SI-NEXT: s_mov_b64 s[12:13], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 ; SI-NEXT: s_xor_b64 s[14:15], vcc, exec ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] ; SI-NEXT: s_cbranch_scc0 .LBB5_2 ; SI-NEXT: ; %bb.1: ; %.entry ; SI-NEXT: s_wqm_b64 s[14:15], s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[14:15] ; SI-NEXT: v_add_f32_e32 v0, v0, v0 ; SI-NEXT: s_and_b64 exec, exec, s[12:13] ; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_branch .LBB5_3 ; SI-NEXT: .LBB5_2: ; SI-NEXT: s_mov_b64 exec, 0 ; SI-NEXT: exp null off, off, off, off done vm ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB5_3: ; ; GFX9-LABEL: 
wqm_demote_dynamic: ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 ; GFX9-NEXT: s_xor_b64 s[14:15], vcc, exec ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] ; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; %.entry ; GFX9-NEXT: s_wqm_b64 s[14:15], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_branch .LBB5_3 ; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_mov_b64 exec, 0 ; GFX9-NEXT: exp null off, off, off, off done vm ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB5_3: ; ; GFX10-32-LABEL: wqm_demote_dynamic: ; GFX10-32: ; %bb.0: ; %.entry ; GFX10-32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-32-NEXT: s_waitcnt vmcnt(0) ; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_xor_b32 s13, vcc_lo, exec_lo ; GFX10-32-NEXT: s_andn2_b32 s12, s12, s13 ; GFX10-32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10-32-NEXT: ; %bb.1: ; %.entry ; GFX10-32-NEXT: s_wqm_b32 s13, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s13 ; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-32-NEXT: s_waitcnt vmcnt(0) ; GFX10-32-NEXT: s_branch .LBB5_3 ; GFX10-32-NEXT: .LBB5_2: ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-32-NEXT: exp null off, off, off, off done vm ; GFX10-32-NEXT: s_endpgm ; GFX10-32-NEXT: .LBB5_3: ; ; GFX10-64-LABEL: wqm_demote_dynamic: ; GFX10-64: ; %bb.0: ; %.entry ; GFX10-64-NEXT: s_mov_b64 s[12:13], exec ; GFX10-64-NEXT: s_wqm_b64 
exec, exec ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_xor_b64 s[14:15], vcc, exec ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] ; GFX10-64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10-64-NEXT: ; %bb.1: ; %.entry ; GFX10-64-NEXT: s_wqm_b64 s[14:15], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: s_branch .LBB5_3 ; GFX10-64-NEXT: .LBB5_2: ; GFX10-64-NEXT: s_mov_b64 exec, 0 ; GFX10-64-NEXT: exp null off, off, off, off done vm ; GFX10-64-NEXT: s_endpgm ; GFX10-64-NEXT: .LBB5_3: .entry: %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 %tex1 = extractelement <4 x float> %tex, i32 0 %z.cmp = fcmp olt float %tex0, 0.0 call void @llvm.amdgcn.wqm.demote(i1 %z.cmp) %coord1 = fadd float %tex0, %tex1 %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 ret <4 x float> %rtex } define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; SI-LABEL: wqm_deriv: ; SI: ; %bb.0: ; %.entry ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: s_movk_i32 s2, 0x3c00 ; SI-NEXT: s_bfe_u32 s3, 0, 0x100000 ; SI-NEXT: s_bfe_u32 s2, s2, 0x100000 ; SI-NEXT: s_lshl_b32 s4, s3, 16 ; SI-NEXT: s_or_b32 s4, s2, s4 ; SI-NEXT: s_lshl_b32 s2, s2, 16 ; SI-NEXT: s_or_b32 s5, s3, s2 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; SI-NEXT: 
s_cbranch_execz .LBB6_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB6_7 ; SI-NEXT: ; %bb.2: ; %.demote0 ; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[6:7] ; SI-NEXT: .LBB6_3: ; %.continue0 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 s[2:3], s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, v0 ; SI-NEXT: s_nop 1 ; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: s_nop 1 ; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; SI-NEXT: s_and_b64 exec, exec, s[0:1] ; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[2:3], s[0:1], vcc ; SI-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; SI-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; SI-NEXT: s_xor_b64 s[2:3], exec, s[6:7] ; SI-NEXT: s_cbranch_execz .LBB6_6 ; SI-NEXT: ; %bb.4: ; %.demote1 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB6_7 ; SI-NEXT: ; %bb.5: ; %.demote1 ; SI-NEXT: s_mov_b64 exec, 0 ; SI-NEXT: .LBB6_6: ; %.continue1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB6_7: ; SI-NEXT: s_mov_b64 exec, 0 ; SI-NEXT: exp null off, off, off, off done vm ; SI-NEXT: s_endpgm ; ; GFX9-LABEL: wqm_deriv: ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-NEXT: ; %bb.1: ; %.demote0 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX9-NEXT: ; %bb.2: ; %.demote0 ; GFX9-NEXT: s_wqm_b64 
s[4:5], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] ; GFX9-NEXT: .LBB6_3: ; %.continue0 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB6_6 ; GFX9-NEXT: ; %bb.4: ; %.demote1 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX9-NEXT: ; %bb.5: ; %.demote1 ; GFX9-NEXT: s_mov_b64 exec, 0 ; GFX9-NEXT: .LBB6_6: ; %.continue1 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 ; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB6_7: ; GFX9-NEXT: s_mov_b64 exec, 0 ; GFX9-NEXT: exp null off, off, off, off done vm ; GFX9-NEXT: s_endpgm ; ; GFX10-32-LABEL: wqm_deriv: ; GFX10-32: ; %bb.0: ; %.entry ; GFX10-32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10-32-NEXT: s_cbranch_execz .LBB6_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0 ; GFX10-32-NEXT: s_wqm_b32 s2, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2 ; 
GFX10-32-NEXT: .LBB6_3: ; %.continue0 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: s_mov_b32 s1, s0 ; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1 ; GFX10-32-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_and_b32 s1, s0, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s1, s1, -1 ; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 ; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 ; GFX10-32-NEXT: s_cbranch_execz .LBB6_6 ; GFX10-32-NEXT: ; %bb.4: ; %.demote1 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-32-NEXT: ; %bb.5: ; %.demote1 ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-32-NEXT: .LBB6_6: ; %.continue1 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 ; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm ; GFX10-32-NEXT: s_endpgm ; GFX10-32-NEXT: .LBB6_7: ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-32-NEXT: exp null off, off, off, off done vm ; GFX10-32-NEXT: s_endpgm ; ; GFX10-64-LABEL: wqm_deriv: ; GFX10-64: ; %bb.0: ; %.entry ; GFX10-64-NEXT: s_mov_b64 s[0:1], exec ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10-64-NEXT: s_cbranch_execz .LBB6_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0 ; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] ; 
GFX10-64-NEXT: .LBB6_3: ; %.continue0 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] ; GFX10-64-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_b64 s[2:3], s[0:1], vcc ; GFX10-64-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] ; GFX10-64-NEXT: s_cbranch_execz .LBB6_6 ; GFX10-64-NEXT: ; %bb.4: ; %.demote1 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-64-NEXT: ; %bb.5: ; %.demote1 ; GFX10-64-NEXT: s_mov_b64 exec, 0 ; GFX10-64-NEXT: .LBB6_6: ; %.continue1 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 ; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm ; GFX10-64-NEXT: s_endpgm ; GFX10-64-NEXT: .LBB6_7: ; GFX10-64-NEXT: s_mov_b64 exec, 0 ; GFX10-64-NEXT: exp null off, off, off, off done vm ; GFX10-64-NEXT: s_endpgm .entry: %p0 = extractelement <2 x float> %input, i32 0 %p1 = extractelement <2 x float> %input, i32 1 %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 %argi = fptosi float %arg to i32 %cond0 = icmp eq i32 %argi, 0 br i1 %cond0, label %.continue0, label %.demote0 .demote0: call void @llvm.amdgcn.wqm.demote(i1 false) br label %.continue0 .continue0: %live = call i1 @llvm.amdgcn.live.mask() %live.cond = select i1 %live, i32 0, i32 1065353216 %live.v0 = 
call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
  %live.v0f = bitcast i32 %live.v0 to float
  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
  %live.v1f = bitcast i32 %live.v1 to float
  %v0 = fsub float %live.v0f, %live.v1f
  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
  %cond2 = and i1 %live, %cond1
  br i1 %cond2, label %.continue1, label %.demote1

.demote1:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue1

.continue1:
  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
  ret void
}

; Loop variant of the demote + quad-difference pattern.
;   * .entry branches to .demote0 when fptosi(%arg) is non-zero, otherwise
;     straight to the loop header .continue0.
;   * Each iteration selects 0 where llvm.amdgcn.live.mask returns true and the
;     loop counter %count otherwise, reads that value through two mov.dpp calls
;     (dpp_ctrl 85 and 0; the generated code below uses quad_perm:[1,1,1,1] and
;     quad_perm:[0,0,0,0] DPP operands), subtracts the two reads and passes the
;     difference through llvm.amdgcn.wqm.f32.
;   * .demote1 is taken unless %live is true and the difference compares
;     ordered-equal to 0.0.
;   * The loop repeats while %next < %limit (signed); .return then performs the
;     compressed export.
; The assertions inside the function are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) {
; SI-LABEL: wqm_deriv_loop:
; SI: ; %bb.0: ; %.entry
; SI-NEXT: s_mov_b64 s[0:1], exec
; SI-NEXT: s_wqm_b64 exec, exec
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
; SI-NEXT: s_movk_i32 s2, 0x3c00
; SI-NEXT: s_bfe_u32 s3, 0, 0x100000
; SI-NEXT: s_bfe_u32 s2, s2, 0x100000
; SI-NEXT: s_lshl_b32 s4, s3, 16
; SI-NEXT: s_or_b32 s6, s2, s4
; SI-NEXT: s_lshl_b32 s2, s2, 16
; SI-NEXT: s_or_b32 s7, s3, s2
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; SI-NEXT: s_cbranch_execz .LBB7_3
; SI-NEXT: ; %bb.1: ; %.demote0
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; SI-NEXT: s_cbranch_scc0 .LBB7_9
; SI-NEXT: ; %bb.2: ; %.demote0
; SI-NEXT: s_wqm_b64 s[8:9], s[0:1]
; SI-NEXT: s_and_b64 exec, exec, s[8:9]
; SI-NEXT: .LBB7_3: ; %.continue0.preheader
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
; SI-NEXT: s_mov_b64 s[2:3], 0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_branch .LBB7_5
; SI-NEXT: .LBB7_4: ; %.continue1
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; SI-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; SI-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; SI-NEXT: s_andn2_b64 exec, exec, s[2:3]
; SI-NEXT: s_cbranch_execz .LBB7_8
; SI-NEXT: .LBB7_5: ; %.continue0
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v3, v2
; SI-NEXT: s_nop 1
; SI-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT: s_nop 1
; SI-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc
; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; SI-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; SI-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; SI-NEXT: s_cbranch_execz .LBB7_4
; SI-NEXT: ; %bb.6: ; %.demote1
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; SI-NEXT: s_cbranch_scc0 .LBB7_9
; SI-NEXT: ; %bb.7: ; %.demote1
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT: s_wqm_b64 s[8:9], s[0:1]
; SI-NEXT: s_and_b64 exec, exec, s[8:9]
; SI-NEXT: s_branch .LBB7_4
; SI-NEXT: .LBB7_8: ; %.return
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
; SI-NEXT: s_and_b64 exec, exec, s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; SI-NEXT: s_endpgm
; SI-NEXT: .LBB7_9:
; SI-NEXT: s_mov_b64 exec, 0
; SI-NEXT: exp null off, off, off, off done vm
; SI-NEXT: s_endpgm
;
; GFX9-LABEL: wqm_deriv_loop:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: s_wqm_b64 exec, exec
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX9-NEXT: s_cbranch_execz .LBB7_3
; GFX9-NEXT: ; %bb.1: ; %.demote0
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX9-NEXT: s_cbranch_scc0 .LBB7_9
; GFX9-NEXT: ; %bb.2: ; %.demote0
; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: s_branch .LBB7_5
; GFX9-NEXT: .LBB7_4: ; %.continue1
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_add_u32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execz .LBB7_8
; GFX9-NEXT: .LBB7_5: ; %.continue0
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v3, v2
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
; GFX9-NEXT: s_and_b64 s[4:5], s[0:1], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; GFX9-NEXT: s_cbranch_execz .LBB7_4
; GFX9-NEXT: ; %bb.6: ; %.demote1
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX9-NEXT: s_cbranch_scc0 .LBB7_9
; GFX9-NEXT: ; %bb.7: ; %.demote1
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
; GFX9-NEXT: s_branch .LBB7_4
; GFX9-NEXT: .LBB7_8: ; %.return
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX9-NEXT: v_bfrev_b32_e32 v1, 60
; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: .LBB7_9:
; GFX9-NEXT: s_mov_b64 exec, 0
; GFX9-NEXT: exp null off, off, off, off done vm
; GFX9-NEXT: s_endpgm
;
; GFX10-32-LABEL: wqm_deriv_loop:
; GFX10-32: ; %bb.0: ; %.entry
; GFX10-32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-32-NEXT: s_mov_b32 s1, 0
; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2
; GFX10-32-NEXT: s_cbranch_execz .LBB7_3
; GFX10-32-NEXT: ; %bb.1: ; %.demote0
; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo
; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9
; GFX10-32-NEXT: ; %bb.2: ; %.demote0
; GFX10-32-NEXT: s_wqm_b32 s3, s0
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3
; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX10-32-NEXT: v_mov_b32_e32 v0, s1
; GFX10-32-NEXT: s_branch .LBB7_5
; GFX10-32-NEXT: .LBB7_4: ; %.continue1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX10-32-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, v0, v1
; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT: s_cbranch_execz .LBB7_8
; GFX10-32-NEXT: .LBB7_5: ; %.continue0
; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-32-NEXT: s_mov_b32 s2, s0
; GFX10-32-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2
; GFX10-32-NEXT: v_mov_b32_e32 v3, v2
; GFX10-32-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
; GFX10-32-NEXT: s_and_b32 s2, s0, vcc_lo
; GFX10-32-NEXT: s_xor_b32 s2, s2, -1
; GFX10-32-NEXT: s_and_saveexec_b32 s3, s2
; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s3
; GFX10-32-NEXT: s_cbranch_execz .LBB7_4
; GFX10-32-NEXT: ; %bb.6: ; %.demote1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo
; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9
; GFX10-32-NEXT: ; %bb.7: ; %.demote1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT: s_wqm_b32 s3, s0
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3
; GFX10-32-NEXT: s_branch .LBB7_4
; GFX10-32-NEXT: .LBB7_8: ; %.return
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60
; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; GFX10-32-NEXT: s_endpgm
; GFX10-32-NEXT: .LBB7_9:
; GFX10-32-NEXT: s_mov_b32 exec_lo, 0
; GFX10-32-NEXT: exp null off, off, off, off done vm
; GFX10-32-NEXT: s_endpgm
;
; GFX10-64-LABEL: wqm_deriv_loop:
; GFX10-64: ; %bb.0: ; %.entry
; GFX10-64-NEXT: s_mov_b64 s[0:1], exec
; GFX10-64-NEXT: s_wqm_b64 exec, exec
; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-64-NEXT: s_mov_b32 s4, 0
; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX10-64-NEXT: s_cbranch_execz .LBB7_3
; GFX10-64-NEXT: ; %bb.1: ; %.demote0
; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9
; GFX10-64-NEXT: ; %bb.2: ; %.demote0
; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7]
; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader
; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX10-64-NEXT: v_mov_b32_e32 v0, s4
; GFX10-64-NEXT: s_mov_b64 s[2:3], 0
; GFX10-64-NEXT: s_branch .LBB7_5
; GFX10-64-NEXT: .LBB7_4: ; %.continue1
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX10-64-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; GFX10-64-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX10-64-NEXT: s_cbranch_execz .LBB7_8
; GFX10-64-NEXT: .LBB7_5: ; %.continue0
; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5]
; GFX10-64-NEXT: v_mov_b32_e32 v3, v2
; GFX10-64-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
; GFX10-64-NEXT: s_and_b64 s[4:5], s[0:1], vcc
; GFX10-64-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GFX10-64-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; GFX10-64-NEXT: s_cbranch_execz .LBB7_4
; GFX10-64-NEXT: ; %bb.6: ; %.demote1
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9
; GFX10-64-NEXT: ; %bb.7: ; %.demote1
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7]
; GFX10-64-NEXT: s_branch .LBB7_4
; GFX10-64-NEXT: .LBB7_8: ; %.return
; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60
; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; GFX10-64-NEXT: s_endpgm
; GFX10-64-NEXT: .LBB7_9:
; GFX10-64-NEXT: s_mov_b64 exec, 0
; GFX10-64-NEXT: exp null off, off, off, off done vm
; GFX10-64-NEXT: s_endpgm
.entry:
  %p0 = extractelement <2 x float> %input, i32 0
  %p1 = extractelement <2 x float> %input, i32 1
  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2
  ; NOTE(review): %x1 has no uses in this function (and %x0 only feeds %x1),
  ; so the two interp calls are dead as written. Left untouched because the
  ; assertions above are autogenerated from this exact IR.
  %argi = fptosi float %arg to i32
  %cond0 = icmp eq i32 %argi, 0
  ; Skip the first demote when the converted argument is zero.
  br i1 %cond0, label %.continue0, label %.demote0

.demote0:
  ; The demote condition is the constant false. In the generated code this
  ; block clears the currently executing lanes from the copy of exec that was
  ; saved at function entry.
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue0

.continue0:
  ; Loop header: %count is 0 when entered from .entry or .demote0 and %next on
  ; the back edge from .continue1.
  %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ]
  %live = call i1 @llvm.amdgcn.live.mask()
  ; 0 where %live is true, the current loop counter otherwise.
  %live.cond = select i1 %live, i32 0, i32 %count
  ; Two DPP reads of the same value with dpp_ctrl 85 and 0; their float
  ; difference is the value tested below.
  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
  %live.v0f = bitcast i32 %live.v0 to float
  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
  %live.v1f = bitcast i32 %live.v1 to float
  %v0 = fsub float %live.v0f, %live.v1f
  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
  ; Continue without demoting only when the lane is live and the difference
  ; is exactly 0.0; every other case goes through .demote1.
  %cond2 = and i1 %live, %cond1
  br i1 %cond2, label %.continue1, label %.demote1

.demote1:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue1

.continue1:
  ; Loop latch: increment the counter and repeat while %next < %limit (signed).
  %next = add i32 %count, 1
  %loop.cond = icmp slt i32 %next, %limit
  br i1 %loop.cond, label %.continue0, label %.return

.return:
  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
  ret void
}

; Intrinsic declarations, followed by the attribute groups (#0-#4) that the
; declarations and call sites refer to.
declare void @llvm.amdgcn.wqm.demote(i1) #0
declare i1 @llvm.amdgcn.live.mask() #0
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare float @llvm.amdgcn.wqm.f32(float) #1
declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3
declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind readnone speculatable }
attributes #3 = { inaccessiblememonly nounwind }
attributes #4 = { convergent nounwind readnone }