; Code-generation test for the integer `mul` instruction on the amdgcn target,
; compiled by llc with -global-isel. The test is run once per -mcpu value:
; gfx700, gfx801, gfx900, gfx1010 and gfx1100. The first three runs share the
; common check prefix GCN, the last two share GFX10PLUS, and every run also has
; its own per-generation prefix. The gfx1100 run additionally passes
; -amdgpu-enable-delay-alu=0 (presumably to keep delay-ALU instructions out of
; the expected output; confirm against the backend option's documentation).
;
; Functions named s_mul_* are shader entry points whose arguments are marked
; inreg; functions named v_mul_* use the default calling convention. Each body
; is a single `mul` on the tested type followed by `ret`.
;
; The expected-output comments below are produced by the script named on the
; first original line of this file. Regenerate them with that script after a
; backend change rather than editing them by hand.
;
; Keep comments in this file free of any check-prefix name followed by a colon
; and of the lit keyword for run commands followed by a colon: both tools scan
; comment text and would treat such a string as a directive.
;
; NOTE(review): in this copy many directives and definitions share one physical
; line. lit and FileCheck read directives line by line, so confirm that the
; checked-in file has one directive per line.
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) { ; GFX7-LABEL: s_mul_i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mul_i32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_mul_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_mul_i32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_i16: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff ; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } define i16 @v_mul_i16(i16 %num, i16 %den) { ; GFX7-LABEL: v_mul_i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u16_e32 v0, 
v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_mul_i16: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = mul i16 %num, %den ret i16 %result } define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inreg zeroext %den) { ; GFX7-LABEL: s_mul_i16_zeroext: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mul_i32 s0, s0, s1 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_mul_i16_zeroext: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_mul_i32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i16_zeroext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_i16_zeroext: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff ; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff ; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) { ; GFX7-LABEL: v_mul_i16_zeroext: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i16_zeroext: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: 
v_mul_lo_u16_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i16_zeroext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_mul_i16_zeroext: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = mul i16 %num, %den ret i16 %result } define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) { ; GFX7-LABEL: s_mul_i16_signext: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mul_i32 s0, s0, s1 ; GFX7-NEXT: s_sext_i32_i16 s0, s0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_mul_i16_signext: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_mul_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i16_signext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: s_sext_i32_i16 s0, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_i16_signext: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff ; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 ; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) { ; GFX7-LABEL: v_mul_i16_signext: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i16_signext: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i16_signext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_mul_i16_signext: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = mul i16 %num, %den ret i16 %result } define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) { ; GCN-LABEL: s_mul_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_mul_i32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_i32: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul i32 %num, %den ret i32 %result } define i32 @v_mul_i32(i32 %num, i32 %den) { ; GCN-LABEL: v_mul_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_mul_i32: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10PLUS-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = mul i32 %num, %den ret i32 %result } define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) { ; GCN-LABEL: s_mul_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_mul_i32 s0, s0, s2 ; GCN-NEXT: s_mul_i32 s1, s1, s3 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_v2i32: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2 ; GFX10PLUS-NEXT: 
s_mul_i32 s1, s1, s3 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul <2 x i32> %num, %den ret <2 x i32> %result } define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) { ; GCN-LABEL: v_mul_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v0, v0, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_mul_v2i32: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10PLUS-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX10PLUS-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = mul <2 x i32> %num, %den ret <2 x i32> %result } define amdgpu_cs i33 @s_mul_i33(i33 inreg %num, i33 inreg %den) { ; GFX7-LABEL: s_mul_i33: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX7-NEXT: s_mul_i32 s4, s0, s2 ; GFX7-NEXT: s_mul_i32 s0, s0, s3 ; GFX7-NEXT: s_mul_i32 s1, s1, s2 ; GFX7-NEXT: v_readfirstlane_b32 s5, v0 ; GFX7-NEXT: s_add_u32 s0, s0, s5 ; GFX7-NEXT: s_add_u32 s1, s1, s0 ; GFX7-NEXT: s_mov_b32 s0, s4 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_mul_i33: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: s_mul_i32 s4, s0, s2 ; GFX8-NEXT: s_mul_i32 s0, s0, s3 ; GFX8-NEXT: s_mul_i32 s1, s1, s2 ; GFX8-NEXT: v_readfirstlane_b32 s5, v0 ; GFX8-NEXT: s_add_u32 s0, s0, s5 ; GFX8-NEXT: s_add_u32 s1, s1, s0 ; GFX8-NEXT: s_mov_b32 s0, s4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i33: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mul_i32 s4, s0, s2 ; GFX9-NEXT: s_mul_hi_u32 s5, s0, s2 ; GFX9-NEXT: s_mul_i32 s0, s0, s3 ; GFX9-NEXT: s_add_u32 s0, s0, s5 ; GFX9-NEXT: s_mul_i32 s1, s1, s2 ; GFX9-NEXT: s_add_u32 s1, s1, s0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_i33: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: 
s_mul_hi_u32 s4, s0, s2 ; GFX10PLUS-NEXT: s_mul_i32 s3, s0, s3 ; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s2 ; GFX10PLUS-NEXT: s_add_i32 s3, s4, s3 ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2 ; GFX10PLUS-NEXT: s_add_i32 s1, s3, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul i33 %num, %den ret i33 %result } define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) { ; GFX7-LABEL: s_mul_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX7-NEXT: s_mul_i32 s4, s0, s2 ; GFX7-NEXT: s_mul_i32 s0, s0, s3 ; GFX7-NEXT: s_mul_i32 s1, s1, s2 ; GFX7-NEXT: v_readfirstlane_b32 s5, v0 ; GFX7-NEXT: s_add_u32 s0, s0, s5 ; GFX7-NEXT: s_add_u32 s1, s1, s0 ; GFX7-NEXT: s_mov_b32 s0, s4 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_mul_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: s_mul_i32 s4, s0, s2 ; GFX8-NEXT: s_mul_i32 s0, s0, s3 ; GFX8-NEXT: s_mul_i32 s1, s1, s2 ; GFX8-NEXT: v_readfirstlane_b32 s5, v0 ; GFX8-NEXT: s_add_u32 s0, s0, s5 ; GFX8-NEXT: s_add_u32 s1, s1, s0 ; GFX8-NEXT: s_mov_b32 s0, s4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mul_i32 s4, s0, s2 ; GFX9-NEXT: s_mul_hi_u32 s5, s0, s2 ; GFX9-NEXT: s_mul_i32 s0, s0, s3 ; GFX9-NEXT: s_add_u32 s0, s0, s5 ; GFX9-NEXT: s_mul_i32 s1, s1, s2 ; GFX9-NEXT: s_add_u32 s1, s1, s0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_i64: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_mul_hi_u32 s4, s0, s2 ; GFX10PLUS-NEXT: s_mul_i32 s3, s0, s3 ; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s2 ; GFX10PLUS-NEXT: s_add_i32 s3, s4, s3 ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2 ; GFX10PLUS-NEXT: s_add_i32 s1, s3, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul i64 %num, %den ret i64 %result } define i64 @v_mul_i64(i64 %num, i64 %den) { ; GCN-LABEL: v_mul_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v4, v0 ; GCN-NEXT: v_mov_b32_e32 v5, v1 ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 ; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] ; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v2, 0 ; GFX10-NEXT: v_mul_lo_u32 v3, v4, v3 ; GFX10-NEXT: v_mul_lo_u32 v2, v5, v2 ; GFX10-NEXT: v_add3_u32 v1, v1, v3, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0 ; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3 ; GFX11-NEXT: v_mul_lo_u32 v2, v5, v2 ; GFX11-NEXT: v_add3_u32 v1, v1, v3, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = mul i64 %num, %den ret i64 %result } define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) { ; GFX7-LABEL: s_mul_i96: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX7-NEXT: s_mul_i32 s5, s0, s5 ; GFX7-NEXT: v_readfirstlane_b32 s7, v0 ; GFX7-NEXT: s_mul_i32 s8, s1, s4 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: s_add_u32 s5, s8, s5 ; GFX7-NEXT: s_mul_i32 s2, s2, s3 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, s3 ; GFX7-NEXT: s_mul_i32 s6, s0, s3 ; GFX7-NEXT: s_add_u32 s2, s2, s5 ; GFX7-NEXT: s_mul_i32 s0, s0, s4 ; GFX7-NEXT: v_readfirstlane_b32 s4, v1 ; GFX7-NEXT: s_add_u32 s0, s0, s7 ; GFX7-NEXT: s_addc_u32 s2, s4, s2 ; GFX7-NEXT: s_mul_i32 s1, s1, s3 ; GFX7-NEXT: v_readfirstlane_b32 s3, v0 ; GFX7-NEXT: s_add_u32 s1, s1, s0 ; GFX7-NEXT: 
s_addc_u32 s2, s3, s2 ; GFX7-NEXT: s_mov_b32 s0, s6 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_mul_i96: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX8-NEXT: s_mul_i32 s5, s0, s5 ; GFX8-NEXT: v_readfirstlane_b32 s7, v0 ; GFX8-NEXT: s_mul_i32 s8, s1, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: s_add_u32 s5, s8, s5 ; GFX8-NEXT: s_mul_i32 s2, s2, s3 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, s3 ; GFX8-NEXT: s_mul_i32 s6, s0, s3 ; GFX8-NEXT: s_add_u32 s2, s2, s5 ; GFX8-NEXT: s_mul_i32 s0, s0, s4 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: s_add_u32 s0, s0, s7 ; GFX8-NEXT: s_addc_u32 s2, s4, s2 ; GFX8-NEXT: s_mul_i32 s1, s1, s3 ; GFX8-NEXT: v_readfirstlane_b32 s3, v0 ; GFX8-NEXT: s_add_u32 s1, s1, s0 ; GFX8-NEXT: s_addc_u32 s2, s3, s2 ; GFX8-NEXT: s_mov_b32 s0, s6 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i96: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mul_i32 s5, s0, s5 ; GFX9-NEXT: s_mul_i32 s8, s1, s4 ; GFX9-NEXT: s_add_u32 s5, s8, s5 ; GFX9-NEXT: s_mul_i32 s2, s2, s3 ; GFX9-NEXT: s_mul_hi_u32 s7, s0, s3 ; GFX9-NEXT: s_add_u32 s2, s2, s5 ; GFX9-NEXT: s_mul_i32 s5, s0, s4 ; GFX9-NEXT: s_mul_i32 s6, s0, s3 ; GFX9-NEXT: s_mul_hi_u32 s0, s0, s4 ; GFX9-NEXT: s_add_u32 s4, s5, s7 ; GFX9-NEXT: s_addc_u32 s0, s0, s2 ; GFX9-NEXT: s_mul_i32 s2, s1, s3 ; GFX9-NEXT: s_mul_hi_u32 s3, s1, s3 ; GFX9-NEXT: s_add_u32 s1, s2, s4 ; GFX9-NEXT: s_addc_u32 s2, s3, s0 ; GFX9-NEXT: s_mov_b32 s0, s6 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_i96: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_mul_i32 s6, s0, s5 ; GFX10PLUS-NEXT: s_mul_i32 s7, s1, s4 ; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s3 ; GFX10PLUS-NEXT: s_add_i32 s6, s6, s7 ; GFX10PLUS-NEXT: s_mul_hi_u32 s7, s0, s3 ; GFX10PLUS-NEXT: s_add_i32 s6, s6, s2 ; GFX10PLUS-NEXT: s_mul_i32 s2, s0, s4 ; GFX10PLUS-NEXT: s_mul_i32 s5, s0, s3 ; GFX10PLUS-NEXT: s_mul_hi_u32 s0, 
s0, s4 ; GFX10PLUS-NEXT: s_add_u32 s2, s2, s7 ; GFX10PLUS-NEXT: s_mul_i32 s4, s1, s3 ; GFX10PLUS-NEXT: s_addc_u32 s0, s0, s6 ; GFX10PLUS-NEXT: s_mul_hi_u32 s3, s1, s3 ; GFX10PLUS-NEXT: s_add_u32 s1, s4, s2 ; GFX10PLUS-NEXT: s_addc_u32 s2, s3, s0 ; GFX10PLUS-NEXT: s_mov_b32 s0, s5 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul i96 %num, %den %cast = bitcast i96 %result to <3 x i32> ret <3 x i32> %cast } define i96 @v_mul_i96(i96 %num, i96 %den) { ; GCN-LABEL: v_mul_i96: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v6, v0 ; GCN-NEXT: v_mov_b32_e32 v7, v1 ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 ; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1] ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0 ; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v3, v[8:9] ; GCN-NEXT: v_mov_b32_e32 v2, v8 ; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2] ; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i96: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: v_mov_b32_e32 v7, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, v3 ; GFX10-NEXT: v_mul_lo_u32 v5, v6, v5 ; GFX10-NEXT: v_mul_lo_u32 v8, v7, v4 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0 ; GFX10-NEXT: v_add3_u32 v2, v5, v8, v2 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2] ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i96: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 ; GFX11-NEXT: v_mul_lo_u32 v2, v2, v3 ; GFX11-NEXT: v_mul_lo_u32 v5, v6, v5 ; GFX11-NEXT: v_mul_lo_u32 v8, v7, v4 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0 ; GFX11-NEXT: v_add3_u32 v2, v5, v8, v2 ; 
GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2] ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2] ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = mul i96 %num, %den ret i96 %result } define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) { ; GFX7-LABEL: s_mul_i128: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mul_hi_u32 v2, s1, v1 ; GFX7-NEXT: s_mul_i32 s10, s0, s6 ; GFX7-NEXT: v_readfirstlane_b32 s9, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s13, v2 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mul_hi_u32 v2, v2, s4 ; GFX7-NEXT: s_mul_i32 s12, s1, s5 ; GFX7-NEXT: v_readfirstlane_b32 s11, v0 ; GFX7-NEXT: s_add_u32 s10, s12, s10 ; GFX7-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: s_addc_u32 s11, s13, s11 ; GFX7-NEXT: s_mul_i32 s12, s2, s4 ; GFX7-NEXT: v_readfirstlane_b32 s13, v2 ; GFX7-NEXT: s_add_u32 s10, s12, s10 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, s4 ; GFX7-NEXT: s_addc_u32 s11, s13, s11 ; GFX7-NEXT: s_mul_i32 s12, s0, s5 ; GFX7-NEXT: v_readfirstlane_b32 s13, v1 ; GFX7-NEXT: s_add_u32 s9, s12, s9 ; GFX7-NEXT: s_addc_u32 s10, s13, s10 ; GFX7-NEXT: s_mul_i32 s13, s1, s4 ; GFX7-NEXT: s_cselect_b32 s12, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s14, v0 ; GFX7-NEXT: s_add_u32 s9, s13, s9 ; GFX7-NEXT: s_mul_i32 s8, s0, s4 ; GFX7-NEXT: s_addc_u32 s10, s14, s10 ; GFX7-NEXT: s_mul_i32 s0, s0, s7 ; GFX7-NEXT: s_addc_u32 s0, s11, s0 ; GFX7-NEXT: s_mul_i32 s1, s1, s6 ; GFX7-NEXT: s_cmp_lg_u32 s12, 0 ; GFX7-NEXT: s_addc_u32 s0, s0, s1 ; GFX7-NEXT: s_mul_i32 s2, s2, s5 ; GFX7-NEXT: s_add_u32 s0, s2, s0 ; GFX7-NEXT: s_mul_i32 s3, s3, s4 ; GFX7-NEXT: s_add_u32 s3, s3, s0 ; GFX7-NEXT: s_mov_b32 s0, s8 ; GFX7-NEXT: s_mov_b32 s1, s9 ; GFX7-NEXT: s_mov_b32 s2, s10 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_mul_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mul_hi_u32 v2, s1, v1 ; GFX8-NEXT: s_mul_i32 s10, s0, s6 ; GFX8-NEXT: v_readfirstlane_b32 s9, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s13, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mul_hi_u32 v2, v2, s4 ; GFX8-NEXT: s_mul_i32 s12, s1, s5 ; GFX8-NEXT: v_readfirstlane_b32 s11, v0 ; GFX8-NEXT: s_add_u32 s10, s12, s10 ; GFX8-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: s_addc_u32 s11, s13, s11 ; GFX8-NEXT: s_mul_i32 s12, s2, s4 ; GFX8-NEXT: v_readfirstlane_b32 s13, v2 ; GFX8-NEXT: s_add_u32 s10, s12, s10 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, s4 ; GFX8-NEXT: s_addc_u32 s11, s13, s11 ; GFX8-NEXT: s_mul_i32 s12, s0, s5 ; GFX8-NEXT: v_readfirstlane_b32 s13, v1 ; GFX8-NEXT: s_add_u32 s9, s12, s9 ; GFX8-NEXT: s_addc_u32 s10, s13, s10 ; GFX8-NEXT: s_mul_i32 s13, s1, s4 ; GFX8-NEXT: s_cselect_b32 s12, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s14, v0 ; GFX8-NEXT: s_add_u32 s9, s13, s9 ; GFX8-NEXT: s_mul_i32 s8, s0, s4 ; GFX8-NEXT: s_addc_u32 s10, s14, s10 ; GFX8-NEXT: s_mul_i32 s0, s0, s7 ; GFX8-NEXT: s_addc_u32 s0, s11, s0 ; GFX8-NEXT: s_mul_i32 s1, s1, s6 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: s_addc_u32 s0, s0, s1 ; GFX8-NEXT: s_mul_i32 s2, s2, s5 ; GFX8-NEXT: s_add_u32 s0, s2, s0 ; GFX8-NEXT: s_mul_i32 s3, s3, s4 ; GFX8-NEXT: s_add_u32 s3, s3, s0 ; GFX8-NEXT: s_mov_b32 s0, s8 ; GFX8-NEXT: s_mov_b32 s1, s9 ; GFX8-NEXT: s_mov_b32 s2, s10 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mul_i32 s10, s0, s6 ; GFX9-NEXT: s_mul_i32 s12, s1, s5 ; GFX9-NEXT: s_mul_hi_u32 s11, s0, s6 ; GFX9-NEXT: s_mul_hi_u32 s13, s1, s5 ; GFX9-NEXT: s_add_u32 s10, s12, s10 ; GFX9-NEXT: s_addc_u32 s11, s13, s11 ; GFX9-NEXT: s_mul_i32 s12, s2, s4 ; GFX9-NEXT: s_mul_hi_u32 s13, s2, s4 ; GFX9-NEXT: s_add_u32 s10, s12, s10 ; GFX9-NEXT: 
s_mul_hi_u32 s9, s0, s4 ; GFX9-NEXT: s_addc_u32 s11, s13, s11 ; GFX9-NEXT: s_mul_i32 s12, s0, s5 ; GFX9-NEXT: s_mul_hi_u32 s13, s0, s5 ; GFX9-NEXT: s_add_u32 s9, s12, s9 ; GFX9-NEXT: s_addc_u32 s10, s13, s10 ; GFX9-NEXT: s_mul_i32 s13, s1, s4 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s14, s1, s4 ; GFX9-NEXT: s_add_u32 s9, s13, s9 ; GFX9-NEXT: s_mul_i32 s8, s0, s4 ; GFX9-NEXT: s_addc_u32 s10, s14, s10 ; GFX9-NEXT: s_mul_i32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s0, s11, s0 ; GFX9-NEXT: s_mul_i32 s1, s1, s6 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: s_addc_u32 s0, s0, s1 ; GFX9-NEXT: s_mul_i32 s2, s2, s5 ; GFX9-NEXT: s_add_u32 s0, s2, s0 ; GFX9-NEXT: s_mul_i32 s3, s3, s4 ; GFX9-NEXT: s_add_u32 s3, s3, s0 ; GFX9-NEXT: s_mov_b32 s0, s8 ; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: s_mov_b32 s2, s10 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_i128: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_mul_i32 s9, s0, s6 ; GFX10PLUS-NEXT: s_mul_i32 s11, s1, s5 ; GFX10PLUS-NEXT: s_mul_hi_u32 s10, s0, s6 ; GFX10PLUS-NEXT: s_mul_hi_u32 s12, s1, s5 ; GFX10PLUS-NEXT: s_add_u32 s9, s11, s9 ; GFX10PLUS-NEXT: s_mul_i32 s11, s2, s4 ; GFX10PLUS-NEXT: s_addc_u32 s10, s12, s10 ; GFX10PLUS-NEXT: s_mul_hi_u32 s12, s2, s4 ; GFX10PLUS-NEXT: s_mul_hi_u32 s8, s0, s4 ; GFX10PLUS-NEXT: s_add_u32 s9, s11, s9 ; GFX10PLUS-NEXT: s_mul_i32 s11, s0, s5 ; GFX10PLUS-NEXT: s_addc_u32 s10, s12, s10 ; GFX10PLUS-NEXT: s_mul_hi_u32 s12, s0, s5 ; GFX10PLUS-NEXT: s_add_u32 s8, s11, s8 ; GFX10PLUS-NEXT: s_addc_u32 s9, s12, s9 ; GFX10PLUS-NEXT: s_mul_i32 s12, s1, s4 ; GFX10PLUS-NEXT: s_mul_hi_u32 s13, s1, s4 ; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s8, s12, s8 ; GFX10PLUS-NEXT: s_mul_i32 s12, s0, s7 ; GFX10PLUS-NEXT: s_addc_u32 s7, s13, s9 ; GFX10PLUS-NEXT: s_addc_u32 s9, s10, s12 ; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s6 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s5 ; GFX10PLUS-NEXT: s_addc_u32 s1, s9, s1 ; 
GFX10PLUS-NEXT: s_mul_i32 s3, s3, s4 ; GFX10PLUS-NEXT: s_add_i32 s1, s1, s2 ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s4 ; GFX10PLUS-NEXT: s_add_i32 s3, s1, s3 ; GFX10PLUS-NEXT: s_mov_b32 s1, s8 ; GFX10PLUS-NEXT: s_mov_b32 s2, s7 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul i128 %num, %den %cast = bitcast i128 %result to <4 x i32> ret <4 x i32> %cast } define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX7-LABEL: v_mul_i128: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v8, v0 ; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX7-NEXT: v_mov_b32_e32 v10, v2 ; GFX7-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 ; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] ; GFX7-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX7-NEXT: v_mov_b32_e32 v2, v11 ; GFX7-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] ; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] ; GFX7-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] ; GFX7-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc ; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] ; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v8, v0 ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, v2 ; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 ; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] ; GFX8-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX8-NEXT: v_mov_b32_e32 v2, v11 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] ; GFX8-NEXT: 
v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc ; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] ; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v0 ; GFX9-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, v2 ; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 ; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] ; GFX9-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, v11 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] ; GFX9-NEXT: v_addc_co_u32_e64 v7, s[4:5], v12, v7, s[4:5] ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v6, vcc ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, v0 ; GFX10-NEXT: v_mov_b32_e32 v9, v1 ; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, v4 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v6, 0 ; GFX10-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 ; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12] ; GFX10-NEXT: v_mov_b32_e32 v2, v11 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] ; GFX10-NEXT: v_mul_lo_u32 v5, v10, v5 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2] ; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v12, v7, s4 ; GFX10-NEXT: 
v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo ; GFX10-NEXT: v_add3_u32 v3, v4, v5, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 ; GFX11-NEXT: v_mov_b32_e32 v10, v2 ; GFX11-NEXT: v_mul_lo_u32 v3, v3, v4 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0 ; GFX11-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX11-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v9, v5, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v4, 0 ; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v10, v4, v[11:12] ; GFX11-NEXT: v_mov_b32_e32 v2, v11 ; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] ; GFX11-NEXT: v_mul_lo_u32 v5, v10, v5 ; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v4, v[1:2] ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s0, v12, v7, s0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo ; GFX11-NEXT: v_add3_u32 v3, v4, v5, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = mul i128 %num, %den ret i128 %result } define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-LABEL: s_mul_i256: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s16, s0 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: v_mul_hi_u32 v2, s1, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, s16, v1 ; GFX7-NEXT: v_readfirstlane_b32 s17, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX7-NEXT: v_readfirstlane_b32 s21, v2 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_readfirstlane_b32 s23, v1 ; GFX7-NEXT: v_readfirstlane_b32 s19, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: v_mul_hi_u32 v1, v0, s8 ; GFX7-NEXT: v_mul_hi_u32 v3, v2, s8 ; GFX7-NEXT: v_mov_b32_e32 v4, s11 ; GFX7-NEXT: s_mul_i32 s18, s16, s10 ; GFX7-NEXT: v_readfirstlane_b32 s24, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s12 ; GFX7-NEXT: 
v_readfirstlane_b32 s22, v3 ; GFX7-NEXT: v_mul_hi_u32 v3, s16, v1 ; GFX7-NEXT: s_mul_i32 s20, s1, s9 ; GFX7-NEXT: v_mul_hi_u32 v5, s1, v4 ; GFX7-NEXT: s_add_u32 s18, s20, s18 ; GFX7-NEXT: v_readfirstlane_b32 s25, v3 ; GFX7-NEXT: v_mul_hi_u32 v3, v2, s10 ; GFX7-NEXT: s_addc_u32 s19, s21, s19 ; GFX7-NEXT: s_mul_i32 s21, s2, s8 ; GFX7-NEXT: s_cselect_b32 s20, 1, 0 ; GFX7-NEXT: s_add_u32 s18, s21, s18 ; GFX7-NEXT: v_readfirstlane_b32 s28, v3 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_addc_u32 s19, s22, s19 ; GFX7-NEXT: s_mul_i32 s22, s16, s9 ; GFX7-NEXT: v_readfirstlane_b32 s27, v5 ; GFX7-NEXT: v_mul_hi_u32 v5, v3, s9 ; GFX7-NEXT: s_cselect_b32 s21, 1, 0 ; GFX7-NEXT: s_add_u32 s17, s22, s17 ; GFX7-NEXT: s_addc_u32 s18, s23, s18 ; GFX7-NEXT: s_mul_i32 s23, s1, s8 ; GFX7-NEXT: s_cselect_b32 s22, 1, 0 ; GFX7-NEXT: s_add_u32 s17, s23, s17 ; GFX7-NEXT: s_addc_u32 s18, s24, s18 ; GFX7-NEXT: s_mul_i32 s24, s16, s12 ; GFX7-NEXT: s_mul_i32 s26, s1, s11 ; GFX7-NEXT: v_readfirstlane_b32 s29, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 ; GFX7-NEXT: s_cselect_b32 s23, 1, 0 ; GFX7-NEXT: s_add_u32 s24, s26, s24 ; GFX7-NEXT: v_mul_hi_u32 v6, v5, s8 ; GFX7-NEXT: s_addc_u32 s25, s27, s25 ; GFX7-NEXT: s_mul_i32 s27, s2, s10 ; GFX7-NEXT: s_cselect_b32 s26, 1, 0 ; GFX7-NEXT: s_add_u32 s24, s27, s24 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, s10 ; GFX7-NEXT: s_addc_u32 s25, s28, s25 ; GFX7-NEXT: s_mul_i32 s28, s3, s9 ; GFX7-NEXT: s_cselect_b32 s27, 1, 0 ; GFX7-NEXT: s_add_u32 s24, s28, s24 ; GFX7-NEXT: v_readfirstlane_b32 s30, v6 ; GFX7-NEXT: v_mul_hi_u32 v6, s16, v4 ; GFX7-NEXT: s_addc_u32 s25, s29, s25 ; GFX7-NEXT: s_mul_i32 s29, s4, s8 ; GFX7-NEXT: s_cselect_b32 s28, 1, 0 ; GFX7-NEXT: s_add_u32 s24, s29, s24 ; GFX7-NEXT: v_readfirstlane_b32 s33, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v2, s9 ; GFX7-NEXT: s_addc_u32 s25, s30, s25 ; GFX7-NEXT: s_mul_i32 s30, s16, s11 ; GFX7-NEXT: s_cselect_b32 s29, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s31, v6 ; GFX7-NEXT: s_add_u32 s19, s30, s19 ; GFX7-NEXT: s_addc_u32 
s24, s31, s24 ; GFX7-NEXT: s_mul_i32 s31, s1, s10 ; GFX7-NEXT: s_cselect_b32 s30, 1, 0 ; GFX7-NEXT: s_add_u32 s19, s31, s19 ; GFX7-NEXT: v_readfirstlane_b32 s34, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v3, s8 ; GFX7-NEXT: s_addc_u32 s24, s33, s24 ; GFX7-NEXT: s_mul_i32 s33, s2, s9 ; GFX7-NEXT: s_cselect_b32 s31, 1, 0 ; GFX7-NEXT: s_add_u32 s19, s33, s19 ; GFX7-NEXT: s_addc_u32 s24, s34, s24 ; GFX7-NEXT: s_mul_i32 s34, s3, s8 ; GFX7-NEXT: s_cselect_b32 s33, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: s_add_u32 s19, s34, s19 ; GFX7-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-NEXT: s_addc_u32 s24, s35, s24 ; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX7-NEXT: s_cselect_b32 s34, 1, 0 ; GFX7-NEXT: s_cmp_lg_u32 s23, 0 ; GFX7-NEXT: s_addc_u32 s19, s22, s19 ; GFX7-NEXT: v_mov_b32_e32 v2, s13 ; GFX7-NEXT: s_cselect_b32 s22, 1, 0 ; GFX7-NEXT: s_cmp_lg_u32 s21, 0 ; GFX7-NEXT: v_mul_hi_u32 v6, s1, v2 ; GFX7-NEXT: s_addc_u32 s20, s20, 0 ; GFX7-NEXT: v_readfirstlane_b32 s23, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, s2, v1 ; GFX7-NEXT: s_cmp_lg_u32 s22, 0 ; GFX7-NEXT: s_addc_u32 s20, s20, s24 ; GFX7-NEXT: s_mul_i32 s22, s16, s14 ; GFX7-NEXT: s_mul_i32 s24, s1, s13 ; GFX7-NEXT: s_cselect_b32 s21, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6 ; GFX7-NEXT: s_add_u32 s22, s24, s22 ; GFX7-NEXT: s_addc_u32 s23, s35, s23 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v3, s11 ; GFX7-NEXT: s_mul_i32 s24, s2, s12 ; GFX7-NEXT: s_add_u32 s22, s24, s22 ; GFX7-NEXT: s_addc_u32 s23, s35, s23 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v5, s10 ; GFX7-NEXT: s_mul_i32 s24, s3, s11 ; GFX7-NEXT: s_add_u32 s22, s24, s22 ; GFX7-NEXT: s_addc_u32 s23, s35, s23 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mul_hi_u32 v6, v0, s9 ; GFX7-NEXT: s_mul_i32 s24, s4, s10 ; GFX7-NEXT: s_add_u32 s22, s24, s22 ; GFX7-NEXT: v_mul_hi_u32 v1, s1, v1 ; GFX7-NEXT: s_addc_u32 s23, s35, s23 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6 ; 
GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: v_mul_hi_u32 v6, v6, s8 ; GFX7-NEXT: s_mul_i32 s24, s5, s9 ; GFX7-NEXT: s_add_u32 s22, s24, s22 ; GFX7-NEXT: v_mul_hi_u32 v2, s16, v2 ; GFX7-NEXT: v_readfirstlane_b32 s36, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, s2, v4 ; GFX7-NEXT: s_addc_u32 s23, s35, s23 ; GFX7-NEXT: s_mul_i32 s24, s6, s8 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6 ; GFX7-NEXT: s_add_u32 s22, s24, s22 ; GFX7-NEXT: s_addc_u32 s23, s35, s23 ; GFX7-NEXT: s_mul_i32 s24, s16, s13 ; GFX7-NEXT: v_readfirstlane_b32 s35, v2 ; GFX7-NEXT: s_add_u32 s24, s24, s25 ; GFX7-NEXT: v_readfirstlane_b32 s37, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, v3, s10 ; GFX7-NEXT: s_addc_u32 s22, s35, s22 ; GFX7-NEXT: s_mul_i32 s35, s1, s12 ; GFX7-NEXT: s_cselect_b32 s25, 1, 0 ; GFX7-NEXT: s_add_u32 s24, s35, s24 ; GFX7-NEXT: s_addc_u32 s22, s36, s22 ; GFX7-NEXT: s_mul_i32 s36, s2, s11 ; GFX7-NEXT: s_cselect_b32 s35, 1, 0 ; GFX7-NEXT: s_add_u32 s24, s36, s24 ; GFX7-NEXT: v_readfirstlane_b32 s38, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, v5, s9 ; GFX7-NEXT: s_addc_u32 s22, s37, s22 ; GFX7-NEXT: s_mul_i32 s37, s3, s10 ; GFX7-NEXT: s_cselect_b32 s36, 1, 0 ; GFX7-NEXT: s_add_u32 s24, s37, s24 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, s8 ; GFX7-NEXT: s_addc_u32 s22, s38, s22 ; GFX7-NEXT: s_mul_i32 s38, s4, s9 ; GFX7-NEXT: s_cselect_b32 s37, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s39, v1 ; GFX7-NEXT: s_add_u32 s24, s38, s24 ; GFX7-NEXT: s_addc_u32 s22, s39, s22 ; GFX7-NEXT: s_mul_i32 s39, s5, s8 ; GFX7-NEXT: s_cselect_b32 s38, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s40, v0 ; GFX7-NEXT: s_add_u32 s24, s39, s24 ; GFX7-NEXT: s_addc_u32 s22, s40, s22 ; GFX7-NEXT: s_cselect_b32 s39, 1, 0 ; GFX7-NEXT: s_cmp_lg_u32 s31, 0 ; GFX7-NEXT: s_addc_u32 s30, s30, 0 ; GFX7-NEXT: s_cmp_lg_u32 s33, 0 ; GFX7-NEXT: s_addc_u32 s30, s30, 0 ; GFX7-NEXT: s_cmp_lg_u32 s34, 0 ; GFX7-NEXT: s_addc_u32 s30, s30, 0 ; GFX7-NEXT: s_cmp_lg_u32 s21, 0 ; GFX7-NEXT: s_addc_u32 s21, s30, s24 ; GFX7-NEXT: s_cselect_b32 s24, 1, 0 ; GFX7-NEXT: s_cmp_lg_u32 s27, 0 ; 
GFX7-NEXT: s_addc_u32 s26, s26, 0 ; GFX7-NEXT: s_cmp_lg_u32 s28, 0 ; GFX7-NEXT: s_addc_u32 s26, s26, 0 ; GFX7-NEXT: s_cmp_lg_u32 s29, 0 ; GFX7-NEXT: s_addc_u32 s26, s26, 0 ; GFX7-NEXT: s_cmp_lg_u32 s24, 0 ; GFX7-NEXT: s_addc_u32 s22, s26, s22 ; GFX7-NEXT: s_mul_i32 s16, s16, s15 ; GFX7-NEXT: s_addc_u32 s15, s23, s16 ; GFX7-NEXT: s_mul_i32 s1, s1, s14 ; GFX7-NEXT: s_cmp_lg_u32 s39, 0 ; GFX7-NEXT: s_addc_u32 s1, s15, s1 ; GFX7-NEXT: s_mul_i32 s2, s2, s13 ; GFX7-NEXT: s_cmp_lg_u32 s38, 0 ; GFX7-NEXT: s_addc_u32 s1, s1, s2 ; GFX7-NEXT: s_mul_i32 s3, s3, s12 ; GFX7-NEXT: s_cmp_lg_u32 s37, 0 ; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: s_mul_i32 s4, s4, s11 ; GFX7-NEXT: s_cmp_lg_u32 s36, 0 ; GFX7-NEXT: s_addc_u32 s1, s1, s4 ; GFX7-NEXT: s_mul_i32 s5, s5, s10 ; GFX7-NEXT: s_cmp_lg_u32 s35, 0 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_mul_i32 s6, s6, s9 ; GFX7-NEXT: s_cmp_lg_u32 s25, 0 ; GFX7-NEXT: s_addc_u32 s1, s1, s6 ; GFX7-NEXT: s_mul_i32 s7, s7, s8 ; GFX7-NEXT: s_mul_i32 s0, s0, s8 ; GFX7-NEXT: s_add_u32 s7, s7, s1 ; GFX7-NEXT: s_mov_b32 s1, s17 ; GFX7-NEXT: s_mov_b32 s2, s18 ; GFX7-NEXT: s_mov_b32 s3, s19 ; GFX7-NEXT: s_mov_b32 s4, s20 ; GFX7-NEXT: s_mov_b32 s5, s21 ; GFX7-NEXT: s_mov_b32 s6, s22 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s16, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: v_mul_hi_u32 v2, s1, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s16, v1 ; GFX8-NEXT: v_readfirstlane_b32 s17, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX8-NEXT: v_readfirstlane_b32 s21, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_readfirstlane_b32 s23, v1 ; GFX8-NEXT: v_readfirstlane_b32 s19, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, s8 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v4, s11 ; GFX8-NEXT: s_mul_i32 s18, s16, s10 ; GFX8-NEXT: 
v_readfirstlane_b32 s24, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s12 ; GFX8-NEXT: v_readfirstlane_b32 s22, v3 ; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1 ; GFX8-NEXT: s_mul_i32 s20, s1, s9 ; GFX8-NEXT: v_mul_hi_u32 v5, s1, v4 ; GFX8-NEXT: s_add_u32 s18, s20, s18 ; GFX8-NEXT: v_readfirstlane_b32 s25, v3 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, s10 ; GFX8-NEXT: s_addc_u32 s19, s21, s19 ; GFX8-NEXT: s_mul_i32 s21, s2, s8 ; GFX8-NEXT: s_cselect_b32 s20, 1, 0 ; GFX8-NEXT: s_add_u32 s18, s21, s18 ; GFX8-NEXT: v_readfirstlane_b32 s28, v3 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_addc_u32 s19, s22, s19 ; GFX8-NEXT: s_mul_i32 s22, s16, s9 ; GFX8-NEXT: v_readfirstlane_b32 s27, v5 ; GFX8-NEXT: v_mul_hi_u32 v5, v3, s9 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0 ; GFX8-NEXT: s_add_u32 s17, s22, s17 ; GFX8-NEXT: s_addc_u32 s18, s23, s18 ; GFX8-NEXT: s_mul_i32 s23, s1, s8 ; GFX8-NEXT: s_cselect_b32 s22, 1, 0 ; GFX8-NEXT: s_add_u32 s17, s23, s17 ; GFX8-NEXT: s_addc_u32 s18, s24, s18 ; GFX8-NEXT: s_mul_i32 s24, s16, s12 ; GFX8-NEXT: s_mul_i32 s26, s1, s11 ; GFX8-NEXT: v_readfirstlane_b32 s29, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: s_cselect_b32 s23, 1, 0 ; GFX8-NEXT: s_add_u32 s24, s26, s24 ; GFX8-NEXT: v_mul_hi_u32 v6, v5, s8 ; GFX8-NEXT: s_addc_u32 s25, s27, s25 ; GFX8-NEXT: s_mul_i32 s27, s2, s10 ; GFX8-NEXT: s_cselect_b32 s26, 1, 0 ; GFX8-NEXT: s_add_u32 s24, s27, s24 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, s10 ; GFX8-NEXT: s_addc_u32 s25, s28, s25 ; GFX8-NEXT: s_mul_i32 s28, s3, s9 ; GFX8-NEXT: s_cselect_b32 s27, 1, 0 ; GFX8-NEXT: s_add_u32 s24, s28, s24 ; GFX8-NEXT: v_readfirstlane_b32 s30, v6 ; GFX8-NEXT: v_mul_hi_u32 v6, s16, v4 ; GFX8-NEXT: s_addc_u32 s25, s29, s25 ; GFX8-NEXT: s_mul_i32 s29, s4, s8 ; GFX8-NEXT: s_cselect_b32 s28, 1, 0 ; GFX8-NEXT: s_add_u32 s24, s29, s24 ; GFX8-NEXT: v_readfirstlane_b32 s33, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v2, s9 ; GFX8-NEXT: s_addc_u32 s25, s30, s25 ; GFX8-NEXT: s_mul_i32 s30, s16, s11 ; GFX8-NEXT: s_cselect_b32 s29, 1, 0 ; GFX8-NEXT: 
v_readfirstlane_b32 s31, v6 ; GFX8-NEXT: s_add_u32 s19, s30, s19 ; GFX8-NEXT: s_addc_u32 s24, s31, s24 ; GFX8-NEXT: s_mul_i32 s31, s1, s10 ; GFX8-NEXT: s_cselect_b32 s30, 1, 0 ; GFX8-NEXT: s_add_u32 s19, s31, s19 ; GFX8-NEXT: v_readfirstlane_b32 s34, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v3, s8 ; GFX8-NEXT: s_addc_u32 s24, s33, s24 ; GFX8-NEXT: s_mul_i32 s33, s2, s9 ; GFX8-NEXT: s_cselect_b32 s31, 1, 0 ; GFX8-NEXT: s_add_u32 s19, s33, s19 ; GFX8-NEXT: s_addc_u32 s24, s34, s24 ; GFX8-NEXT: s_mul_i32 s34, s3, s8 ; GFX8-NEXT: s_cselect_b32 s33, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: s_add_u32 s19, s34, s19 ; GFX8-NEXT: v_mov_b32_e32 v0, s14 ; GFX8-NEXT: s_addc_u32 s24, s35, s24 ; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX8-NEXT: s_cselect_b32 s34, 1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_addc_u32 s19, s22, s19 ; GFX8-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NEXT: s_cselect_b32 s22, 1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 ; GFX8-NEXT: v_mul_hi_u32 v6, s1, v2 ; GFX8-NEXT: s_addc_u32 s20, s20, 0 ; GFX8-NEXT: v_readfirstlane_b32 s23, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v1 ; GFX8-NEXT: s_cmp_lg_u32 s22, 0 ; GFX8-NEXT: s_addc_u32 s20, s20, s24 ; GFX8-NEXT: s_mul_i32 s22, s16, s14 ; GFX8-NEXT: s_mul_i32 s24, s1, s13 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6 ; GFX8-NEXT: s_add_u32 s22, s24, s22 ; GFX8-NEXT: s_addc_u32 s23, s35, s23 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v3, s11 ; GFX8-NEXT: s_mul_i32 s24, s2, s12 ; GFX8-NEXT: s_add_u32 s22, s24, s22 ; GFX8-NEXT: s_addc_u32 s23, s35, s23 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v5, s10 ; GFX8-NEXT: s_mul_i32 s24, s3, s11 ; GFX8-NEXT: s_add_u32 s22, s24, s22 ; GFX8-NEXT: s_addc_u32 s23, s35, s23 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mul_hi_u32 v6, v0, s9 ; GFX8-NEXT: s_mul_i32 s24, s4, s10 ; GFX8-NEXT: s_add_u32 s22, s24, s22 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 
; GFX8-NEXT: s_addc_u32 s23, s35, s23 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: v_mul_hi_u32 v6, v6, s8 ; GFX8-NEXT: s_mul_i32 s24, s5, s9 ; GFX8-NEXT: s_add_u32 s22, s24, s22 ; GFX8-NEXT: v_mul_hi_u32 v2, s16, v2 ; GFX8-NEXT: v_readfirstlane_b32 s36, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s2, v4 ; GFX8-NEXT: s_addc_u32 s23, s35, s23 ; GFX8-NEXT: s_mul_i32 s24, s6, s8 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6 ; GFX8-NEXT: s_add_u32 s22, s24, s22 ; GFX8-NEXT: s_addc_u32 s23, s35, s23 ; GFX8-NEXT: s_mul_i32 s24, s16, s13 ; GFX8-NEXT: v_readfirstlane_b32 s35, v2 ; GFX8-NEXT: s_add_u32 s24, s24, s25 ; GFX8-NEXT: v_readfirstlane_b32 s37, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, v3, s10 ; GFX8-NEXT: s_addc_u32 s22, s35, s22 ; GFX8-NEXT: s_mul_i32 s35, s1, s12 ; GFX8-NEXT: s_cselect_b32 s25, 1, 0 ; GFX8-NEXT: s_add_u32 s24, s35, s24 ; GFX8-NEXT: s_addc_u32 s22, s36, s22 ; GFX8-NEXT: s_mul_i32 s36, s2, s11 ; GFX8-NEXT: s_cselect_b32 s35, 1, 0 ; GFX8-NEXT: s_add_u32 s24, s36, s24 ; GFX8-NEXT: v_readfirstlane_b32 s38, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, v5, s9 ; GFX8-NEXT: s_addc_u32 s22, s37, s22 ; GFX8-NEXT: s_mul_i32 s37, s3, s10 ; GFX8-NEXT: s_cselect_b32 s36, 1, 0 ; GFX8-NEXT: s_add_u32 s24, s37, s24 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, s8 ; GFX8-NEXT: s_addc_u32 s22, s38, s22 ; GFX8-NEXT: s_mul_i32 s38, s4, s9 ; GFX8-NEXT: s_cselect_b32 s37, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s39, v1 ; GFX8-NEXT: s_add_u32 s24, s38, s24 ; GFX8-NEXT: s_addc_u32 s22, s39, s22 ; GFX8-NEXT: s_mul_i32 s39, s5, s8 ; GFX8-NEXT: s_cselect_b32 s38, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s40, v0 ; GFX8-NEXT: s_add_u32 s24, s39, s24 ; GFX8-NEXT: s_addc_u32 s22, s40, s22 ; GFX8-NEXT: s_cselect_b32 s39, 1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s31, 0 ; GFX8-NEXT: s_addc_u32 s30, s30, 0 ; GFX8-NEXT: s_cmp_lg_u32 s33, 0 ; GFX8-NEXT: s_addc_u32 s30, s30, 0 ; GFX8-NEXT: s_cmp_lg_u32 s34, 0 ; GFX8-NEXT: s_addc_u32 s30, s30, 0 ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 ; GFX8-NEXT: s_addc_u32 s21, 
s30, s24 ; GFX8-NEXT: s_cselect_b32 s24, 1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s27, 0 ; GFX8-NEXT: s_addc_u32 s26, s26, 0 ; GFX8-NEXT: s_cmp_lg_u32 s28, 0 ; GFX8-NEXT: s_addc_u32 s26, s26, 0 ; GFX8-NEXT: s_cmp_lg_u32 s29, 0 ; GFX8-NEXT: s_addc_u32 s26, s26, 0 ; GFX8-NEXT: s_cmp_lg_u32 s24, 0 ; GFX8-NEXT: s_addc_u32 s22, s26, s22 ; GFX8-NEXT: s_mul_i32 s16, s16, s15 ; GFX8-NEXT: s_addc_u32 s15, s23, s16 ; GFX8-NEXT: s_mul_i32 s1, s1, s14 ; GFX8-NEXT: s_cmp_lg_u32 s39, 0 ; GFX8-NEXT: s_addc_u32 s1, s15, s1 ; GFX8-NEXT: s_mul_i32 s2, s2, s13 ; GFX8-NEXT: s_cmp_lg_u32 s38, 0 ; GFX8-NEXT: s_addc_u32 s1, s1, s2 ; GFX8-NEXT: s_mul_i32 s3, s3, s12 ; GFX8-NEXT: s_cmp_lg_u32 s37, 0 ; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: s_mul_i32 s4, s4, s11 ; GFX8-NEXT: s_cmp_lg_u32 s36, 0 ; GFX8-NEXT: s_addc_u32 s1, s1, s4 ; GFX8-NEXT: s_mul_i32 s5, s5, s10 ; GFX8-NEXT: s_cmp_lg_u32 s35, 0 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_mul_i32 s6, s6, s9 ; GFX8-NEXT: s_cmp_lg_u32 s25, 0 ; GFX8-NEXT: s_addc_u32 s1, s1, s6 ; GFX8-NEXT: s_mul_i32 s7, s7, s8 ; GFX8-NEXT: s_mul_i32 s0, s0, s8 ; GFX8-NEXT: s_add_u32 s7, s7, s1 ; GFX8-NEXT: s_mov_b32 s1, s17 ; GFX8-NEXT: s_mov_b32 s2, s18 ; GFX8-NEXT: s_mov_b32 s3, s19 ; GFX8-NEXT: s_mov_b32 s4, s20 ; GFX8-NEXT: s_mov_b32 s5, s21 ; GFX8-NEXT: s_mov_b32 s6, s22 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mul_i32 s18, s0, s10 ; GFX9-NEXT: s_mul_i32 s20, s1, s9 ; GFX9-NEXT: s_mul_hi_u32 s19, s0, s10 ; GFX9-NEXT: s_mul_hi_u32 s21, s1, s9 ; GFX9-NEXT: s_add_u32 s18, s20, s18 ; GFX9-NEXT: s_addc_u32 s19, s21, s19 ; GFX9-NEXT: s_mul_i32 s21, s2, s8 ; GFX9-NEXT: s_cselect_b32 s20, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s22, s2, s8 ; GFX9-NEXT: s_add_u32 s18, s21, s18 ; GFX9-NEXT: s_mul_hi_u32 s17, s0, s8 ; GFX9-NEXT: s_addc_u32 s19, s22, s19 ; GFX9-NEXT: s_mul_i32 s22, s0, s9 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s23, s0, s9 ; GFX9-NEXT: s_add_u32 s17, s22, s17 ; GFX9-NEXT: 
s_addc_u32 s18, s23, s18 ; GFX9-NEXT: s_mul_i32 s23, s1, s8 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s24, s1, s8 ; GFX9-NEXT: s_add_u32 s17, s23, s17 ; GFX9-NEXT: s_addc_u32 s18, s24, s18 ; GFX9-NEXT: s_mul_i32 s24, s0, s12 ; GFX9-NEXT: s_mul_i32 s26, s1, s11 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s25, s0, s12 ; GFX9-NEXT: s_mul_hi_u32 s27, s1, s11 ; GFX9-NEXT: s_add_u32 s24, s26, s24 ; GFX9-NEXT: s_addc_u32 s25, s27, s25 ; GFX9-NEXT: s_mul_i32 s27, s2, s10 ; GFX9-NEXT: s_cselect_b32 s26, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s28, s2, s10 ; GFX9-NEXT: s_add_u32 s24, s27, s24 ; GFX9-NEXT: s_addc_u32 s25, s28, s25 ; GFX9-NEXT: s_mul_i32 s28, s3, s9 ; GFX9-NEXT: s_cselect_b32 s27, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s29, s3, s9 ; GFX9-NEXT: s_add_u32 s24, s28, s24 ; GFX9-NEXT: s_addc_u32 s25, s29, s25 ; GFX9-NEXT: s_mul_i32 s29, s4, s8 ; GFX9-NEXT: s_cselect_b32 s28, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s30, s4, s8 ; GFX9-NEXT: s_add_u32 s24, s29, s24 ; GFX9-NEXT: s_addc_u32 s25, s30, s25 ; GFX9-NEXT: s_mul_i32 s30, s0, s11 ; GFX9-NEXT: s_cselect_b32 s29, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s31, s0, s11 ; GFX9-NEXT: s_add_u32 s19, s30, s19 ; GFX9-NEXT: s_addc_u32 s24, s31, s24 ; GFX9-NEXT: s_mul_i32 s31, s1, s10 ; GFX9-NEXT: s_cselect_b32 s30, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s33, s1, s10 ; GFX9-NEXT: s_add_u32 s19, s31, s19 ; GFX9-NEXT: s_addc_u32 s24, s33, s24 ; GFX9-NEXT: s_mul_i32 s33, s2, s9 ; GFX9-NEXT: s_cselect_b32 s31, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s34, s2, s9 ; GFX9-NEXT: s_add_u32 s19, s33, s19 ; GFX9-NEXT: s_addc_u32 s24, s34, s24 ; GFX9-NEXT: s_mul_i32 s34, s3, s8 ; GFX9-NEXT: s_cselect_b32 s33, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s35, s3, s8 ; GFX9-NEXT: s_add_u32 s19, s34, s19 ; GFX9-NEXT: s_addc_u32 s24, s35, s24 ; GFX9-NEXT: s_cselect_b32 s34, 1, 0 ; GFX9-NEXT: s_cmp_lg_u32 s23, 0 ; GFX9-NEXT: s_addc_u32 s19, s22, s19 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 ; GFX9-NEXT: s_cmp_lg_u32 s21, 0 ; GFX9-NEXT: s_addc_u32 s20, s20, 0 ; GFX9-NEXT: 
s_cmp_lg_u32 s22, 0 ; GFX9-NEXT: s_addc_u32 s20, s20, s24 ; GFX9-NEXT: s_mul_i32 s22, s0, s14 ; GFX9-NEXT: s_mul_i32 s24, s1, s13 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s23, s0, s14 ; GFX9-NEXT: s_mul_hi_u32 s35, s1, s13 ; GFX9-NEXT: s_add_u32 s22, s24, s22 ; GFX9-NEXT: s_addc_u32 s23, s35, s23 ; GFX9-NEXT: s_mul_i32 s24, s2, s12 ; GFX9-NEXT: s_mul_hi_u32 s35, s2, s12 ; GFX9-NEXT: s_add_u32 s22, s24, s22 ; GFX9-NEXT: s_addc_u32 s23, s35, s23 ; GFX9-NEXT: s_mul_i32 s24, s3, s11 ; GFX9-NEXT: s_mul_hi_u32 s35, s3, s11 ; GFX9-NEXT: s_add_u32 s22, s24, s22 ; GFX9-NEXT: s_addc_u32 s23, s35, s23 ; GFX9-NEXT: s_mul_i32 s24, s4, s10 ; GFX9-NEXT: s_mul_hi_u32 s35, s4, s10 ; GFX9-NEXT: s_add_u32 s22, s24, s22 ; GFX9-NEXT: s_addc_u32 s23, s35, s23 ; GFX9-NEXT: s_mul_i32 s24, s5, s9 ; GFX9-NEXT: s_mul_hi_u32 s35, s5, s9 ; GFX9-NEXT: s_add_u32 s22, s24, s22 ; GFX9-NEXT: s_addc_u32 s23, s35, s23 ; GFX9-NEXT: s_mul_i32 s24, s6, s8 ; GFX9-NEXT: s_mul_hi_u32 s35, s6, s8 ; GFX9-NEXT: s_add_u32 s22, s24, s22 ; GFX9-NEXT: s_addc_u32 s23, s35, s23 ; GFX9-NEXT: s_mul_i32 s24, s0, s13 ; GFX9-NEXT: s_mul_hi_u32 s35, s0, s13 ; GFX9-NEXT: s_add_u32 s24, s24, s25 ; GFX9-NEXT: s_addc_u32 s22, s35, s22 ; GFX9-NEXT: s_mul_i32 s35, s1, s12 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s36, s1, s12 ; GFX9-NEXT: s_add_u32 s24, s35, s24 ; GFX9-NEXT: s_addc_u32 s22, s36, s22 ; GFX9-NEXT: s_mul_i32 s36, s2, s11 ; GFX9-NEXT: s_cselect_b32 s35, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s37, s2, s11 ; GFX9-NEXT: s_add_u32 s24, s36, s24 ; GFX9-NEXT: s_addc_u32 s22, s37, s22 ; GFX9-NEXT: s_mul_i32 s37, s3, s10 ; GFX9-NEXT: s_cselect_b32 s36, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s38, s3, s10 ; GFX9-NEXT: s_add_u32 s24, s37, s24 ; GFX9-NEXT: s_addc_u32 s22, s38, s22 ; GFX9-NEXT: s_mul_i32 s38, s4, s9 ; GFX9-NEXT: s_cselect_b32 s37, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s39, s4, s9 ; GFX9-NEXT: s_add_u32 s24, s38, s24 ; GFX9-NEXT: s_addc_u32 s22, s39, s22 ; GFX9-NEXT: s_mul_i32 s39, s5, s8 ; 
GFX9-NEXT: s_cselect_b32 s38, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s40, s5, s8 ; GFX9-NEXT: s_add_u32 s24, s39, s24 ; GFX9-NEXT: s_addc_u32 s22, s40, s22 ; GFX9-NEXT: s_cselect_b32 s39, 1, 0 ; GFX9-NEXT: s_cmp_lg_u32 s31, 0 ; GFX9-NEXT: s_addc_u32 s30, s30, 0 ; GFX9-NEXT: s_cmp_lg_u32 s33, 0 ; GFX9-NEXT: s_addc_u32 s30, s30, 0 ; GFX9-NEXT: s_cmp_lg_u32 s34, 0 ; GFX9-NEXT: s_addc_u32 s30, s30, 0 ; GFX9-NEXT: s_cmp_lg_u32 s21, 0 ; GFX9-NEXT: s_addc_u32 s21, s30, s24 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 ; GFX9-NEXT: s_addc_u32 s26, s26, 0 ; GFX9-NEXT: s_cmp_lg_u32 s28, 0 ; GFX9-NEXT: s_addc_u32 s26, s26, 0 ; GFX9-NEXT: s_cmp_lg_u32 s29, 0 ; GFX9-NEXT: s_addc_u32 s26, s26, 0 ; GFX9-NEXT: s_cmp_lg_u32 s24, 0 ; GFX9-NEXT: s_mul_i32 s16, s0, s8 ; GFX9-NEXT: s_addc_u32 s22, s26, s22 ; GFX9-NEXT: s_mul_i32 s0, s0, s15 ; GFX9-NEXT: s_addc_u32 s0, s23, s0 ; GFX9-NEXT: s_mul_i32 s1, s1, s14 ; GFX9-NEXT: s_cmp_lg_u32 s39, 0 ; GFX9-NEXT: s_addc_u32 s0, s0, s1 ; GFX9-NEXT: s_mul_i32 s2, s2, s13 ; GFX9-NEXT: s_cmp_lg_u32 s38, 0 ; GFX9-NEXT: s_addc_u32 s0, s0, s2 ; GFX9-NEXT: s_mul_i32 s3, s3, s12 ; GFX9-NEXT: s_cmp_lg_u32 s37, 0 ; GFX9-NEXT: s_addc_u32 s0, s0, s3 ; GFX9-NEXT: s_mul_i32 s4, s4, s11 ; GFX9-NEXT: s_cmp_lg_u32 s36, 0 ; GFX9-NEXT: s_addc_u32 s0, s0, s4 ; GFX9-NEXT: s_mul_i32 s5, s5, s10 ; GFX9-NEXT: s_cmp_lg_u32 s35, 0 ; GFX9-NEXT: s_addc_u32 s0, s0, s5 ; GFX9-NEXT: s_mul_i32 s6, s6, s9 ; GFX9-NEXT: s_cmp_lg_u32 s25, 0 ; GFX9-NEXT: s_addc_u32 s0, s0, s6 ; GFX9-NEXT: s_mul_i32 s7, s7, s8 ; GFX9-NEXT: s_add_u32 s7, s7, s0 ; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: s_mov_b32 s1, s17 ; GFX9-NEXT: s_mov_b32 s2, s18 ; GFX9-NEXT: s_mov_b32 s3, s19 ; GFX9-NEXT: s_mov_b32 s4, s20 ; GFX9-NEXT: s_mov_b32 s5, s21 ; GFX9-NEXT: s_mov_b32 s6, s22 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_i256: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_mul_i32 s17, s0, s10 ; GFX10PLUS-NEXT: s_mul_i32 s19, s1, s9 ; GFX10PLUS-NEXT: s_mul_hi_u32 
s18, s0, s10 ; GFX10PLUS-NEXT: s_mul_hi_u32 s20, s1, s9 ; GFX10PLUS-NEXT: s_add_u32 s17, s19, s17 ; GFX10PLUS-NEXT: s_addc_u32 s18, s20, s18 ; GFX10PLUS-NEXT: s_mul_i32 s20, s2, s8 ; GFX10PLUS-NEXT: s_mul_hi_u32 s21, s2, s8 ; GFX10PLUS-NEXT: s_cselect_b32 s19, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s17, s20, s17 ; GFX10PLUS-NEXT: s_mul_hi_u32 s16, s0, s8 ; GFX10PLUS-NEXT: s_addc_u32 s18, s21, s18 ; GFX10PLUS-NEXT: s_mul_i32 s21, s0, s9 ; GFX10PLUS-NEXT: s_mul_hi_u32 s22, s0, s9 ; GFX10PLUS-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s16, s21, s16 ; GFX10PLUS-NEXT: s_addc_u32 s17, s22, s17 ; GFX10PLUS-NEXT: s_mul_i32 s22, s1, s8 ; GFX10PLUS-NEXT: s_mul_hi_u32 s23, s1, s8 ; GFX10PLUS-NEXT: s_cselect_b32 s21, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s16, s22, s16 ; GFX10PLUS-NEXT: s_addc_u32 s17, s23, s17 ; GFX10PLUS-NEXT: s_mul_i32 s23, s0, s12 ; GFX10PLUS-NEXT: s_mul_i32 s25, s1, s11 ; GFX10PLUS-NEXT: s_mul_hi_u32 s24, s0, s12 ; GFX10PLUS-NEXT: s_mul_hi_u32 s26, s1, s11 ; GFX10PLUS-NEXT: s_cselect_b32 s22, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s23, s25, s23 ; GFX10PLUS-NEXT: s_addc_u32 s24, s26, s24 ; GFX10PLUS-NEXT: s_mul_i32 s26, s2, s10 ; GFX10PLUS-NEXT: s_mul_hi_u32 s27, s2, s10 ; GFX10PLUS-NEXT: s_cselect_b32 s25, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s23, s26, s23 ; GFX10PLUS-NEXT: s_addc_u32 s24, s27, s24 ; GFX10PLUS-NEXT: s_mul_i32 s27, s3, s9 ; GFX10PLUS-NEXT: s_mul_hi_u32 s28, s3, s9 ; GFX10PLUS-NEXT: s_cselect_b32 s26, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s23, s27, s23 ; GFX10PLUS-NEXT: s_addc_u32 s24, s28, s24 ; GFX10PLUS-NEXT: s_mul_i32 s28, s4, s8 ; GFX10PLUS-NEXT: s_mul_hi_u32 s29, s4, s8 ; GFX10PLUS-NEXT: s_cselect_b32 s27, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s23, s28, s23 ; GFX10PLUS-NEXT: s_addc_u32 s24, s29, s24 ; GFX10PLUS-NEXT: s_mul_i32 s29, s0, s11 ; GFX10PLUS-NEXT: s_mul_hi_u32 s30, s0, s11 ; GFX10PLUS-NEXT: s_cselect_b32 s28, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s18, s29, s18 ; GFX10PLUS-NEXT: s_addc_u32 s23, s30, s23 ; GFX10PLUS-NEXT: s_mul_i32 s30, s1, s10 ; 
GFX10PLUS-NEXT: s_mul_hi_u32 s31, s1, s10 ; GFX10PLUS-NEXT: s_cselect_b32 s29, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s18, s30, s18 ; GFX10PLUS-NEXT: s_addc_u32 s23, s31, s23 ; GFX10PLUS-NEXT: s_mul_i32 s31, s2, s9 ; GFX10PLUS-NEXT: s_mul_hi_u32 s33, s2, s9 ; GFX10PLUS-NEXT: s_cselect_b32 s30, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s18, s31, s18 ; GFX10PLUS-NEXT: s_addc_u32 s23, s33, s23 ; GFX10PLUS-NEXT: s_mul_i32 s33, s3, s8 ; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s3, s8 ; GFX10PLUS-NEXT: s_cselect_b32 s31, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s18, s33, s18 ; GFX10PLUS-NEXT: s_addc_u32 s23, s34, s23 ; GFX10PLUS-NEXT: s_cselect_b32 s33, 1, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s22, 0 ; GFX10PLUS-NEXT: s_mul_hi_u32 s22, s0, s14 ; GFX10PLUS-NEXT: s_addc_u32 s18, s21, s18 ; GFX10PLUS-NEXT: s_cselect_b32 s21, 1, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0 ; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s1, s13 ; GFX10PLUS-NEXT: s_addc_u32 s19, s19, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s21, 0 ; GFX10PLUS-NEXT: s_mul_i32 s21, s0, s14 ; GFX10PLUS-NEXT: s_addc_u32 s19, s19, s23 ; GFX10PLUS-NEXT: s_mul_i32 s23, s1, s13 ; GFX10PLUS-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 ; GFX10PLUS-NEXT: s_mul_i32 s23, s2, s12 ; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 ; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s2, s12 ; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 ; GFX10PLUS-NEXT: s_mul_i32 s23, s3, s11 ; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 ; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s3, s11 ; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 ; GFX10PLUS-NEXT: s_mul_i32 s23, s4, s10 ; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 ; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s4, s10 ; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 ; GFX10PLUS-NEXT: s_mul_i32 s23, s5, s9 ; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 ; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s5, s9 ; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 ; GFX10PLUS-NEXT: s_mul_i32 s23, s6, s8 ; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 ; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s6, s8 ; GFX10PLUS-NEXT: s_add_u32 
s21, s23, s21 ; GFX10PLUS-NEXT: s_mul_i32 s23, s0, s13 ; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 ; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s0, s13 ; GFX10PLUS-NEXT: s_add_u32 s23, s23, s24 ; GFX10PLUS-NEXT: s_addc_u32 s21, s34, s21 ; GFX10PLUS-NEXT: s_mul_i32 s34, s1, s12 ; GFX10PLUS-NEXT: s_mul_hi_u32 s35, s1, s12 ; GFX10PLUS-NEXT: s_cselect_b32 s24, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s23, s34, s23 ; GFX10PLUS-NEXT: s_addc_u32 s21, s35, s21 ; GFX10PLUS-NEXT: s_mul_i32 s35, s2, s11 ; GFX10PLUS-NEXT: s_mul_hi_u32 s36, s2, s11 ; GFX10PLUS-NEXT: s_cselect_b32 s34, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s23, s35, s23 ; GFX10PLUS-NEXT: s_addc_u32 s21, s36, s21 ; GFX10PLUS-NEXT: s_mul_i32 s36, s3, s10 ; GFX10PLUS-NEXT: s_mul_hi_u32 s37, s3, s10 ; GFX10PLUS-NEXT: s_cselect_b32 s35, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s23, s36, s23 ; GFX10PLUS-NEXT: s_addc_u32 s21, s37, s21 ; GFX10PLUS-NEXT: s_mul_i32 s37, s4, s9 ; GFX10PLUS-NEXT: s_mul_hi_u32 s38, s4, s9 ; GFX10PLUS-NEXT: s_cselect_b32 s36, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s23, s37, s23 ; GFX10PLUS-NEXT: s_addc_u32 s21, s38, s21 ; GFX10PLUS-NEXT: s_mul_i32 s38, s5, s8 ; GFX10PLUS-NEXT: s_mul_hi_u32 s39, s5, s8 ; GFX10PLUS-NEXT: s_cselect_b32 s37, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s23, s38, s23 ; GFX10PLUS-NEXT: s_addc_u32 s21, s39, s21 ; GFX10PLUS-NEXT: s_cselect_b32 s38, 1, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s30, 0 ; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s14 ; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s31, 0 ; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s13 ; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s33, 0 ; GFX10PLUS-NEXT: s_mul_i32 s3, s3, s12 ; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0 ; GFX10PLUS-NEXT: s_mul_i32 s4, s4, s11 ; GFX10PLUS-NEXT: s_addc_u32 s20, s29, s23 ; GFX10PLUS-NEXT: s_cselect_b32 s23, 1, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s26, 0 ; GFX10PLUS-NEXT: s_mul_i32 s26, s0, s15 ; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 
s27, 0 ; GFX10PLUS-NEXT: s_mul_i32 s5, s5, s10 ; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s28, 0 ; GFX10PLUS-NEXT: s_mul_i32 s6, s6, s9 ; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s23, 0 ; GFX10PLUS-NEXT: s_mul_i32 s7, s7, s8 ; GFX10PLUS-NEXT: s_addc_u32 s15, s25, s21 ; GFX10PLUS-NEXT: s_addc_u32 s21, s22, s26 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s38, 0 ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s8 ; GFX10PLUS-NEXT: s_addc_u32 s1, s21, s1 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s37, 0 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s2 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s36, 0 ; GFX10PLUS-NEXT: s_mov_b32 s2, s17 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s35, 0 ; GFX10PLUS-NEXT: s_mov_b32 s3, s18 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s4 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s34, 0 ; GFX10PLUS-NEXT: s_mov_b32 s4, s19 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s24, 0 ; GFX10PLUS-NEXT: s_mov_b32 s5, s20 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s6 ; GFX10PLUS-NEXT: s_mov_b32 s6, s15 ; GFX10PLUS-NEXT: s_add_i32 s7, s1, s7 ; GFX10PLUS-NEXT: s_mov_b32 s1, s16 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul i256 %num, %den %cast = bitcast i256 %result to <8 x i32> ret <8 x i32> %cast } define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7-LABEL: v_mul_i256: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v16, v0 ; GFX7-NEXT: v_mov_b32_e32 v17, v1 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, 0 ; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v12, 0 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1] ; GFX7-NEXT: v_mul_lo_u32 v27, v3, v12 ; GFX7-NEXT: v_mul_lo_u32 v26, v5, v10 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[0:1] ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[0:1] ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] ; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v11, v[18:19] ; GFX7-NEXT: 
v_cndmask_b32_e64 v20, 0, 1, s[4:5] ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] ; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] ; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc ; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] ; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v20, vcc ; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] ; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v22, vcc ; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v16, v10, 0 ; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v6, v8, v[0:1] ; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v17, v9, v[22:23] ; GFX7-NEXT: v_mov_b32_e32 v1, v18 ; GFX7-NEXT: v_mov_b32_e32 v18, v19 ; GFX7-NEXT: v_mov_b32_e32 v19, v20 ; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v16, v13, v[18:19] ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[22:23] ; GFX7-NEXT: v_addc_u32_e64 v25, s[4:5], 0, v0, s[4:5] ; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v12, v[18:19] ; GFX7-NEXT: v_mov_b32_e32 v0, v23 ; GFX7-NEXT: v_mul_lo_u32 v23, v4, v11 ; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v11, v[0:1] ; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] ; GFX7-NEXT: v_mul_lo_u32 v13, v2, v13 ; GFX7-NEXT: v_mul_lo_u32 v20, v6, v9 ; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[10:11], v4, v9, v[11:12] ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v17, v10, v[0:1] ; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] ; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[0:1] ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[14:15], v16, v8, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, v22 ; GFX7-NEXT: v_addc_u32_e64 v10, s[12:13], 0, v4, s[12:13] ; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[16:17], v16, v9, v[1:2] ; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] ; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12] ; GFX7-NEXT: v_addc_u32_e64 
v10, s[12:13], 0, v10, s[12:13] ; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[16:17] ; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v17, v8, v[1:2] ; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v11, v3, s[12:13] ; GFX7-NEXT: v_mul_lo_u32 v11, v16, v15 ; GFX7-NEXT: v_mul_lo_u32 v9, v17, v14 ; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v25, v4, s[12:13] ; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v10, v5, s[12:13] ; GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v24, v6, s[12:13] ; GFX7-NEXT: v_addc_u32_e64 v10, s[12:13], v21, v11, s[12:13] ; GFX7-NEXT: v_addc_u32_e64 v9, s[12:13], v10, v9, s[14:15] ; GFX7-NEXT: v_addc_u32_e64 v9, s[10:11], v9, v13, s[10:11] ; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v27, s[8:9] ; GFX7-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v23, s[6:7] ; GFX7-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v26, s[4:5] ; GFX7-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc ; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v16, v0 ; GFX8-NEXT: v_mov_b32_e32 v17, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, 0 ; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v12, 0 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1] ; GFX8-NEXT: v_mul_lo_u32 v27, v3, v12 ; GFX8-NEXT: v_mul_lo_u32 v26, v5, v10 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[0:1] ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[0:1] ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] ; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v11, v[18:19] ; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] ; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] ; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc ; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] ; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v20, vcc ; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] 
; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v22, vcc ; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v16, v10, 0 ; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v6, v8, v[0:1] ; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v17, v9, v[22:23] ; GFX8-NEXT: v_mov_b32_e32 v1, v18 ; GFX8-NEXT: v_mov_b32_e32 v18, v19 ; GFX8-NEXT: v_mov_b32_e32 v19, v20 ; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v16, v13, v[18:19] ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[22:23] ; GFX8-NEXT: v_addc_u32_e64 v25, s[4:5], 0, v0, s[4:5] ; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v12, v[18:19] ; GFX8-NEXT: v_mov_b32_e32 v0, v23 ; GFX8-NEXT: v_mul_lo_u32 v23, v4, v11 ; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v11, v[0:1] ; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] ; GFX8-NEXT: v_mul_lo_u32 v13, v2, v13 ; GFX8-NEXT: v_mul_lo_u32 v20, v6, v9 ; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[10:11], v4, v9, v[11:12] ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v17, v10, v[0:1] ; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] ; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[0:1] ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[14:15], v16, v8, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, v22 ; GFX8-NEXT: v_addc_u32_e64 v10, s[12:13], 0, v4, s[12:13] ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[16:17], v16, v9, v[1:2] ; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] ; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12] ; GFX8-NEXT: v_addc_u32_e64 v10, s[12:13], 0, v10, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[16:17] ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v17, v8, v[1:2] ; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v11, v3, s[12:13] ; GFX8-NEXT: v_mul_lo_u32 v11, v16, v15 ; GFX8-NEXT: v_mul_lo_u32 v9, v17, v14 ; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v25, v4, s[12:13] ; GFX8-NEXT: v_addc_u32_e64 v5, 
s[12:13], v10, v5, s[12:13] ; GFX8-NEXT: v_addc_u32_e64 v6, s[12:13], v24, v6, s[12:13] ; GFX8-NEXT: v_addc_u32_e64 v10, s[12:13], v21, v11, s[12:13] ; GFX8-NEXT: v_addc_u32_e64 v9, s[12:13], v10, v9, s[14:15] ; GFX8-NEXT: v_addc_u32_e64 v9, s[10:11], v9, v13, s[10:11] ; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v27, s[8:9] ; GFX8-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v23, s[6:7] ; GFX8-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v26, s[4:5] ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc ; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v16, v0 ; GFX9-NEXT: v_mov_b32_e32 v17, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, 0 ; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v12, 0 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1] ; GFX9-NEXT: v_mul_lo_u32 v27, v3, v12 ; GFX9-NEXT: v_mul_lo_u32 v26, v5, v10 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[0:1] ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[0:1] ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] ; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v11, v[18:19] ; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] ; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] ; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc ; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] ; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v20, vcc ; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] ; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v22, vcc ; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v16, v10, 0 ; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v6, v8, v[0:1] ; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v17, v9, v[22:23] ; GFX9-NEXT: v_mov_b32_e32 v1, v18 ; GFX9-NEXT: v_mov_b32_e32 v18, v19 ; GFX9-NEXT: v_mov_b32_e32 v19, v20 ; GFX9-NEXT: v_mad_u64_u32 
v[18:19], vcc, v16, v13, v[18:19] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[22:23] ; GFX9-NEXT: v_addc_co_u32_e64 v25, s[4:5], 0, v0, s[4:5] ; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v12, v[18:19] ; GFX9-NEXT: v_mov_b32_e32 v0, v23 ; GFX9-NEXT: v_mul_lo_u32 v23, v4, v11 ; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v11, v[0:1] ; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] ; GFX9-NEXT: v_mul_lo_u32 v13, v2, v13 ; GFX9-NEXT: v_mul_lo_u32 v20, v6, v9 ; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[10:11], v4, v9, v[11:12] ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v17, v10, v[0:1] ; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13] ; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[0:1] ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[14:15], v16, v8, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, v22 ; GFX9-NEXT: v_addc_co_u32_e64 v10, s[12:13], 0, v4, s[12:13] ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[16:17], v16, v9, v[1:2] ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12] ; GFX9-NEXT: v_addc_co_u32_e64 v10, s[12:13], 0, v10, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[16:17] ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v17, v8, v[1:2] ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v11, v3, s[12:13] ; GFX9-NEXT: v_mul_lo_u32 v11, v16, v15 ; GFX9-NEXT: v_mul_lo_u32 v9, v17, v14 ; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v25, v4, s[12:13] ; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v10, v5, s[12:13] ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v24, v6, s[12:13] ; GFX9-NEXT: v_addc_co_u32_e64 v10, s[12:13], v21, v11, s[12:13] ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[12:13], v10, v9, s[14:15] ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[10:11], v9, v13, s[10:11] ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v9, v27, s[8:9] ; 
GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v9, v23, s[6:7] ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], v9, v26, s[4:5] ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v20, vcc ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i256: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v16, v0 ; GFX10-NEXT: v_mov_b32_e32 v17, v1 ; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9 ; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10 ; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0 ; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v12, 0 ; GFX10-NEXT: v_mul_lo_u32 v30, v17, v14 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19] ; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] ; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v16, v10, 0 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] ; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] ; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21] ; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 ; GFX10-NEXT: v_mov_b32_e32 v20, v22 ; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] ; GFX10-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20] ; GFX10-NEXT: v_mov_b32_e32 v20, v18 ; GFX10-NEXT: 
v_mov_b32_e32 v19, v22 ; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15 ; GFX10-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0 ; GFX10-NEXT: v_mul_lo_u32 v20, v4, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 ; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25] ; GFX10-NEXT: v_mul_lo_u32 v25, v3, v12 ; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15] ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 ; GFX10-NEXT: v_mul_lo_u32 v24, v2, v13 ; GFX10-NEXT: v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19] ; GFX10-NEXT: v_mov_b32_e32 v13, v1 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12] ; GFX10-NEXT: v_mov_b32_e32 v14, v21 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 ; GFX10-NEXT: v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19] ; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8 ; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2] ; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s8, 0, v6, s8 ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11] ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13] ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v3, s9 ; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v4, s9 ; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v14, v5, s9 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v6, s9 ; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v23, v22, s9 ; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v30, s8 ; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s6 ; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s7 ; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v20, s5 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v7, v8, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i256: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 ; GFX11-NEXT: v_mul_lo_u32 v7, v7, v8 ; GFX11-NEXT: v_mul_lo_u32 v27, v6, v9 ; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0 ; GFX11-NEXT: v_mad_u64_u32 v[18:19], null, v16, v12, 0 ; GFX11-NEXT: v_mul_lo_u32 v30, v17, v14 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[18:19], s0, v17, v11, v[18:19] ; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] ; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] ; GFX11-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] ; GFX11-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[22:23], null, v6, v8, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21] ; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 ; GFX11-NEXT: v_mov_b32_e32 v20, v22 ; GFX11-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] ; GFX11-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[19:20] ; GFX11-NEXT: v_mov_b32_e32 v20, v18 ; GFX11-NEXT: v_mov_b32_e32 v19, v22 ; GFX11-NEXT: v_mul_lo_u32 v22, v16, v15 ; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[14:15], s2, v16, v11, v[19:20] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v8, 0 ; GFX11-NEXT: v_mul_lo_u32 v20, v4, v11 ; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 ; GFX11-NEXT: v_mad_u64_u32 v[18:19], s1, 
v2, v11, v[24:25] ; GFX11-NEXT: v_mul_lo_u32 v25, v3, v12 ; GFX11-NEXT: v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15] ; GFX11-NEXT: v_mov_b32_e32 v14, v21 ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 ; GFX11-NEXT: v_mad_u64_u32 v[18:19], s3, v3, v10, v[18:19] ; GFX11-NEXT: v_mul_lo_u32 v24, v2, v13 ; GFX11-NEXT: v_mov_b32_e32 v13, v1 ; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[11:12] ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 ; GFX11-NEXT: v_mad_u64_u32 v[10:11], s2, v4, v9, v[18:19] ; GFX11-NEXT: v_mad_u64_u32 v[12:13], s4, v16, v9, v[13:14] ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 ; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v8, v[1:2] ; GFX11-NEXT: v_add_co_ci_u32_e64 v14, s4, 0, v6, s4 ; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v8, v[10:11] ; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v8, v[12:13] ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 ; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5 ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5 ; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s5, v23, v22, s5 ; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v30, s4 ; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v24, s2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1 ; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0 ; GFX11-NEXT: v_add_nc_u32_e32 v7, v8, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = mul i256 %num, %den ret i256 %result }