; Compiler projects using llvm
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s

; Scalar i16 multiply: both operands are passed inreg to an amdgpu_ps shader.
; Every target is expected to emit a single s_mul_i32. The gfx8 and newer
; checks first mask both inputs to their low 16 bits with s_and_b32; the gfx7
; checks multiply the incoming registers directly.
define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
; GFX7-LABEL: s_mul_i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mul_i32 s0, s0, s1
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX8-NEXT:    s_mul_i32 s0, s0, s1
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX9-NEXT:    s_mul_i32 s0, s0, s1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = mul i16 %num, %den
  ret i16 %result
}

; Vector i16 multiply in a default-calling-convention function; the checks
; show the operands in v0/v1 and the result returned in v0.
; The gfx7 checks mask both inputs to 16 bits and use v_mul_u32_u24, while the
; gfx8 and newer checks use the 16-bit v_mul_lo_u16 directly.
define i16 @v_mul_i16(i16 %num, i16 %den) {
; GFX7-LABEL: v_mul_i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_mul_i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_mul_lo_u16 v0, v0, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i16 %num, %den
  ret i16 %result
}

; Scalar i16 multiply where both inreg arguments and the return value carry
; the zeroext attribute. On every target the checks expect the product to be
; masked with 0xffff after the s_mul_i32. The gfx8 and newer checks also mask
; both inputs before the multiply; the gfx7 checks do not.
define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inreg zeroext %den) {
; GFX7-LABEL: s_mul_i16_zeroext:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mul_i32 s0, s0, s1
; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16_zeroext:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX8-NEXT:    s_mul_i32 s0, s0, s1
; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_zeroext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX9-NEXT:    s_mul_i32 s0, s0, s1
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i16_zeroext:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = mul i16 %num, %den
  ret i16 %result
}

; Vector i16 multiply with zeroext arguments and a zeroext return value.
; Expected code per target, as recorded in the checks below:
;   gfx7      - v_mul_u32_u24 on the unmasked inputs, then mask the result
;               with 0xffff.
;   gfx8/gfx9 - v_mul_lo_u16 only, with no extra extension of the result.
;   gfx10+    - v_mul_lo_u16 followed by v_bfe_u32 of the low 16 bits.
define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
; GFX7-LABEL: v_mul_i16_zeroext:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16_zeroext:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16_zeroext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_mul_i16_zeroext:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_mul_lo_u16 v0, v0, v1
; GFX10PLUS-NEXT:    v_bfe_u32 v0, v0, 0, 16
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i16 %num, %den
  ret i16 %result
}

; Scalar i16 multiply where both inreg arguments and the return value carry
; the signext attribute. On every target the checks expect the product to be
; sign-extended from 16 bits with s_sext_i32_i16 after the s_mul_i32. The gfx8
; and newer checks also mask both inputs with 0xffff before the multiply.
define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) {
; GFX7-LABEL: s_mul_i16_signext:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mul_i32 s0, s0, s1
; GFX7-NEXT:    s_sext_i32_i16 s0, s0
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16_signext:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX8-NEXT:    s_mul_i32 s0, s0, s1
; GFX8-NEXT:    s_sext_i32_i16 s0, s0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_signext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX9-NEXT:    s_mul_i32 s0, s0, s1
; GFX9-NEXT:    s_sext_i32_i16 s0, s0
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i16_signext:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT:    s_sext_i32_i16 s0, s0
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = mul i16 %num, %den
  ret i16 %result
}

; Vector i16 multiply with signext arguments and a signext return value.
; All targets are expected to sign-extend the low 16 bits of the product with
; v_bfe_i32 before returning. The gfx7 checks mask both inputs and multiply
; with v_mul_u32_u24; the gfx8 and newer checks use v_mul_lo_u16.
define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
; GFX7-LABEL: v_mul_i16_signext:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16_signext:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16_signext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_mul_i16_signext:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_mul_lo_u16 v0, v0, v1
; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i16 %num, %den
  ret i16 %result
}

; Scalar i32 multiply with inreg operands in an amdgpu_ps shader. All targets
; are expected to emit exactly one s_mul_i32 on s0 and s1.
define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
; GCN-LABEL: s_mul_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_mul_i32 s0, s0, s1
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = mul i32 %num, %den
  ret i32 %result
}

; Vector i32 multiply in a default-calling-convention function. All targets
; are expected to emit one v_mul_lo_u32 on v0 and v1; the gfx10 and newer
; checks additionally expect an s_waitcnt_vscnt at function entry.
define i32 @v_mul_i32(i32 %num, i32 %den) {
; GCN-LABEL: v_mul_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mul_lo_u32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_mul_i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_mul_lo_u32 v0, v0, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i32 %num, %den
  ret i32 %result
}

; Scalar <2 x i32> multiply with inreg operands. The checks expect one
; s_mul_i32 per element: s0*s2 into s0 and s1*s3 into s1.
define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
; GCN-LABEL: s_mul_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_mul_i32 s0, s0, s2
; GCN-NEXT:    s_mul_i32 s1, s1, s3
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_v2i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s2
; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s3
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = mul <2 x i32> %num, %den
  ret <2 x i32> %result
}

; Vector <2 x i32> multiply in a default-calling-convention function. The
; checks expect one v_mul_lo_u32 per element: v0*v2 into v0 and v1*v3 into v1.
define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GCN-LABEL: v_mul_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mul_lo_u32 v0, v0, v2
; GCN-NEXT:    v_mul_lo_u32 v1, v1, v3
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_mul_v2i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_mul_lo_u32 v0, v0, v2
; GFX10PLUS-NEXT:    v_mul_lo_u32 v1, v1, v3
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = mul <2 x i32> %num, %den
  ret <2 x i32> %result
}

; Scalar multiply of a non-power-of-two width (i33) with inreg operands; note
; this function uses the amdgpu_cs calling convention. The checks show the
; operands as two 32-bit register pairs (s0/s1 and s2/s3) and the result
; returned in s0/s1. No instruction truncating the result to 33 bits appears
; in the expected output.
; The gfx7 and gfx8 checks compute the high half of the low*low product with
; the vector v_mul_hi_u32 and move it back with v_readfirstlane_b32; the gfx9
; and newer checks use the scalar s_mul_hi_u32 instead.
define amdgpu_cs i33 @s_mul_i33(i33 inreg %num,  i33 inreg %den) {
; GFX7-LABEL: s_mul_i33:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT:    s_mul_i32 s4, s0, s2
; GFX7-NEXT:    s_mul_i32 s0, s0, s3
; GFX7-NEXT:    s_mul_i32 s1, s1, s2
; GFX7-NEXT:    v_readfirstlane_b32 s5, v0
; GFX7-NEXT:    s_add_u32 s0, s0, s5
; GFX7-NEXT:    s_add_u32 s1, s1, s0
; GFX7-NEXT:    s_mov_b32 s0, s4
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i33:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT:    s_mul_i32 s4, s0, s2
; GFX8-NEXT:    s_mul_i32 s0, s0, s3
; GFX8-NEXT:    s_mul_i32 s1, s1, s2
; GFX8-NEXT:    v_readfirstlane_b32 s5, v0
; GFX8-NEXT:    s_add_u32 s0, s0, s5
; GFX8-NEXT:    s_add_u32 s1, s1, s0
; GFX8-NEXT:    s_mov_b32 s0, s4
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i33:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mul_i32 s4, s0, s2
; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s2
; GFX9-NEXT:    s_mul_i32 s0, s0, s3
; GFX9-NEXT:    s_add_u32 s0, s0, s5
; GFX9-NEXT:    s_mul_i32 s1, s1, s2
; GFX9-NEXT:    s_add_u32 s1, s1, s0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i33:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_mul_hi_u32 s4, s0, s2
; GFX10PLUS-NEXT:    s_mul_i32 s3, s0, s3
; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s2
; GFX10PLUS-NEXT:    s_add_i32 s3, s4, s3
; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s2
; GFX10PLUS-NEXT:    s_add_i32 s1, s3, s1
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = mul i33 %num, %den
  ret i33 %result
}

; Scalar i64 multiply with inreg operands (s0/s1 times s2/s3 in the checks,
; result in s0/s1). The expected expansion is lo*lo for the low word, plus
; mul_hi(lo, lo) + lo*hi + hi*lo for the high word.
; The gfx7 and gfx8 checks obtain the mul_hi term through the vector
; v_mul_hi_u32 followed by v_readfirstlane_b32; the gfx9 and newer checks use
; the scalar s_mul_hi_u32.
define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
; GFX7-LABEL: s_mul_i64:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT:    s_mul_i32 s4, s0, s2
; GFX7-NEXT:    s_mul_i32 s0, s0, s3
; GFX7-NEXT:    s_mul_i32 s1, s1, s2
; GFX7-NEXT:    v_readfirstlane_b32 s5, v0
; GFX7-NEXT:    s_add_u32 s0, s0, s5
; GFX7-NEXT:    s_add_u32 s1, s1, s0
; GFX7-NEXT:    s_mov_b32 s0, s4
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT:    s_mul_i32 s4, s0, s2
; GFX8-NEXT:    s_mul_i32 s0, s0, s3
; GFX8-NEXT:    s_mul_i32 s1, s1, s2
; GFX8-NEXT:    v_readfirstlane_b32 s5, v0
; GFX8-NEXT:    s_add_u32 s0, s0, s5
; GFX8-NEXT:    s_add_u32 s1, s1, s0
; GFX8-NEXT:    s_mov_b32 s0, s4
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mul_i32 s4, s0, s2
; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s2
; GFX9-NEXT:    s_mul_i32 s0, s0, s3
; GFX9-NEXT:    s_add_u32 s0, s0, s5
; GFX9-NEXT:    s_mul_i32 s1, s1, s2
; GFX9-NEXT:    s_add_u32 s1, s1, s0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i64:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_mul_hi_u32 s4, s0, s2
; GFX10PLUS-NEXT:    s_mul_i32 s3, s0, s3
; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s2
; GFX10PLUS-NEXT:    s_add_i32 s3, s4, s3
; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s2
; GFX10PLUS-NEXT:    s_add_i32 s1, s3, s1
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = mul i64 %num, %den
  ret i64 %result
}

; Vector i64 multiply in a default-calling-convention function (v0/v1 times
; v2/v3 in the checks, result in v0/v1).
; The gfx7/gfx8/gfx9 checks expect a chain of three v_mad_u64_u32. The gfx10
; and gfx11 checks expect a single v_mad_u64_u32 for the low*low product, two
; v_mul_lo_u32 for the cross terms and a v_add3_u32 to combine the high word.
; The gfx11 checks also pair the two initial moves into one v_dual_mov_b32 and
; use null for the unused carry-out operand.
define i64 @v_mul_i64(i64 %num, i64 %den) {
; GCN-LABEL: v_mul_i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v4, v0
; GCN-NEXT:    v_mov_b32_e32 v5, v1
; GCN-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2]
; GCN-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v4, v0
; GFX10-NEXT:    v_mov_b32_e32 v5, v1
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
; GFX10-NEXT:    v_mul_lo_u32 v3, v4, v3
; GFX10-NEXT:    v_mul_lo_u32 v2, v5, v2
; GFX10-NEXT:    v_add3_u32 v1, v1, v3, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
; GFX11-NEXT:    v_mul_lo_u32 v3, v4, v3
; GFX11-NEXT:    v_mul_lo_u32 v2, v5, v2
; GFX11-NEXT:    v_add3_u32 v1, v1, v3, v2
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i64 %num, %den
  ret i64 %result
}

; Scalar i96 multiply with inreg operands. The product is bitcast to
; <3 x i32> before being returned, so the checks show the three result words
; in s0, s1 and s2 (operands in s0-s2 and s3-s5).
; The expected expansion builds the result from 32-bit partial products,
; propagating carries between words with s_add_u32 / s_addc_u32. The gfx7 and
; gfx8 checks form each mul_hi term with the vector v_mul_hi_u32 plus
; v_readfirstlane_b32; the gfx9 and newer checks use s_mul_hi_u32.
define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
; GFX7-LABEL: s_mul_i96:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_mov_b32_e32 v0, s3
; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    v_mul_hi_u32 v1, s0, v1
; GFX7-NEXT:    s_mul_i32 s5, s0, s5
; GFX7-NEXT:    v_readfirstlane_b32 s7, v0
; GFX7-NEXT:    s_mul_i32 s8, s1, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    s_add_u32 s5, s8, s5
; GFX7-NEXT:    s_mul_i32 s2, s2, s3
; GFX7-NEXT:    v_mul_hi_u32 v0, v0, s3
; GFX7-NEXT:    s_mul_i32 s6, s0, s3
; GFX7-NEXT:    s_add_u32 s2, s2, s5
; GFX7-NEXT:    s_mul_i32 s0, s0, s4
; GFX7-NEXT:    v_readfirstlane_b32 s4, v1
; GFX7-NEXT:    s_add_u32 s0, s0, s7
; GFX7-NEXT:    s_addc_u32 s2, s4, s2
; GFX7-NEXT:    s_mul_i32 s1, s1, s3
; GFX7-NEXT:    v_readfirstlane_b32 s3, v0
; GFX7-NEXT:    s_add_u32 s1, s1, s0
; GFX7-NEXT:    s_addc_u32 s2, s3, s2
; GFX7-NEXT:    s_mov_b32 s0, s6
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i96:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, s3
; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NEXT:    v_mul_hi_u32 v1, s0, v1
; GFX8-NEXT:    s_mul_i32 s5, s0, s5
; GFX8-NEXT:    v_readfirstlane_b32 s7, v0
; GFX8-NEXT:    s_mul_i32 s8, s1, s4
; GFX8-NEXT:    v_mov_b32_e32 v0, s1
; GFX8-NEXT:    s_add_u32 s5, s8, s5
; GFX8-NEXT:    s_mul_i32 s2, s2, s3
; GFX8-NEXT:    v_mul_hi_u32 v0, v0, s3
; GFX8-NEXT:    s_mul_i32 s6, s0, s3
; GFX8-NEXT:    s_add_u32 s2, s2, s5
; GFX8-NEXT:    s_mul_i32 s0, s0, s4
; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
; GFX8-NEXT:    s_add_u32 s0, s0, s7
; GFX8-NEXT:    s_addc_u32 s2, s4, s2
; GFX8-NEXT:    s_mul_i32 s1, s1, s3
; GFX8-NEXT:    v_readfirstlane_b32 s3, v0
; GFX8-NEXT:    s_add_u32 s1, s1, s0
; GFX8-NEXT:    s_addc_u32 s2, s3, s2
; GFX8-NEXT:    s_mov_b32 s0, s6
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i96:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mul_i32 s5, s0, s5
; GFX9-NEXT:    s_mul_i32 s8, s1, s4
; GFX9-NEXT:    s_add_u32 s5, s8, s5
; GFX9-NEXT:    s_mul_i32 s2, s2, s3
; GFX9-NEXT:    s_mul_hi_u32 s7, s0, s3
; GFX9-NEXT:    s_add_u32 s2, s2, s5
; GFX9-NEXT:    s_mul_i32 s5, s0, s4
; GFX9-NEXT:    s_mul_i32 s6, s0, s3
; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s4
; GFX9-NEXT:    s_add_u32 s4, s5, s7
; GFX9-NEXT:    s_addc_u32 s0, s0, s2
; GFX9-NEXT:    s_mul_i32 s2, s1, s3
; GFX9-NEXT:    s_mul_hi_u32 s3, s1, s3
; GFX9-NEXT:    s_add_u32 s1, s2, s4
; GFX9-NEXT:    s_addc_u32 s2, s3, s0
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i96:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_mul_i32 s6, s0, s5
; GFX10PLUS-NEXT:    s_mul_i32 s7, s1, s4
; GFX10PLUS-NEXT:    s_mul_i32 s2, s2, s3
; GFX10PLUS-NEXT:    s_add_i32 s6, s6, s7
; GFX10PLUS-NEXT:    s_mul_hi_u32 s7, s0, s3
; GFX10PLUS-NEXT:    s_add_i32 s6, s6, s2
; GFX10PLUS-NEXT:    s_mul_i32 s2, s0, s4
; GFX10PLUS-NEXT:    s_mul_i32 s5, s0, s3
; GFX10PLUS-NEXT:    s_mul_hi_u32 s0, s0, s4
; GFX10PLUS-NEXT:    s_add_u32 s2, s2, s7
; GFX10PLUS-NEXT:    s_mul_i32 s4, s1, s3
; GFX10PLUS-NEXT:    s_addc_u32 s0, s0, s6
; GFX10PLUS-NEXT:    s_mul_hi_u32 s3, s1, s3
; GFX10PLUS-NEXT:    s_add_u32 s1, s4, s2
; GFX10PLUS-NEXT:    s_addc_u32 s2, s3, s0
; GFX10PLUS-NEXT:    s_mov_b32 s0, s5
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = mul i96 %num, %den
  %cast = bitcast i96 %result to <3 x i32>
  ret <3 x i32> %cast
}

; Vector i96 multiply in a default-calling-convention function (operands in
; v0-v2 and v3-v5 in the checks, result in v0-v2).
; The gfx7/gfx8/gfx9 checks expect the whole product to be accumulated with
; v_mad_u64_u32. The gfx10 and gfx11 checks compute the three partial products
; that only feed the top word with v_mul_lo_u32, sum them with v_add3_u32,
; and use v_mad_u64_u32 for the remaining terms.
define i96 @v_mul_i96(i96 %num, i96 %den) {
; GCN-LABEL: v_mul_i96:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v6, v0
; GCN-NEXT:    v_mov_b32_e32 v7, v1
; GCN-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
; GCN-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1]
; GCN-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0
; GCN-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v2, v3, v[8:9]
; GCN-NEXT:    v_mov_b32_e32 v2, v8
; GCN-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2]
; GCN-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2]
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i96:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v6, v0
; GFX10-NEXT:    v_mov_b32_e32 v7, v1
; GFX10-NEXT:    v_mul_lo_u32 v2, v2, v3
; GFX10-NEXT:    v_mul_lo_u32 v5, v6, v5
; GFX10-NEXT:    v_mul_lo_u32 v8, v7, v4
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v6, v3, 0
; GFX10-NEXT:    v_add3_u32 v2, v5, v8, v2
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2]
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2]
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i96:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
; GFX11-NEXT:    v_mul_lo_u32 v2, v2, v3
; GFX11-NEXT:    v_mul_lo_u32 v5, v6, v5
; GFX11-NEXT:    v_mul_lo_u32 v8, v7, v4
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v6, v3, 0
; GFX11-NEXT:    v_add3_u32 v2, v5, v8, v2
; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2]
; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i96 %num, %den
  ret i96 %result
}

; Scalar i128 multiply with inreg operands. The product is bitcast to
; <4 x i32> before being returned, so the checks show the four result words
; in s0-s3 (operands in s0-s3 and s4-s7).
; The expected expansion builds the result from 32-bit partial products with
; s_add_u32 / s_addc_u32 carry chains; one intermediate carry is saved with
; s_cselect_b32 and re-materialised later with s_cmp_lg_u32. The gfx7 and gfx8
; checks form each mul_hi term with the vector v_mul_hi_u32 plus
; v_readfirstlane_b32; the gfx9 and newer checks use s_mul_hi_u32.
define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
; GFX7-LABEL: s_mul_i128:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mul_hi_u32 v2, s1, v1
; GFX7-NEXT:    s_mul_i32 s10, s0, s6
; GFX7-NEXT:    v_readfirstlane_b32 s9, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT:    v_readfirstlane_b32 s13, v2
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mul_hi_u32 v2, v2, s4
; GFX7-NEXT:    s_mul_i32 s12, s1, s5
; GFX7-NEXT:    v_readfirstlane_b32 s11, v0
; GFX7-NEXT:    s_add_u32 s10, s12, s10
; GFX7-NEXT:    v_mul_hi_u32 v1, s0, v1
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    s_addc_u32 s11, s13, s11
; GFX7-NEXT:    s_mul_i32 s12, s2, s4
; GFX7-NEXT:    v_readfirstlane_b32 s13, v2
; GFX7-NEXT:    s_add_u32 s10, s12, s10
; GFX7-NEXT:    v_mul_hi_u32 v0, v0, s4
; GFX7-NEXT:    s_addc_u32 s11, s13, s11
; GFX7-NEXT:    s_mul_i32 s12, s0, s5
; GFX7-NEXT:    v_readfirstlane_b32 s13, v1
; GFX7-NEXT:    s_add_u32 s9, s12, s9
; GFX7-NEXT:    s_addc_u32 s10, s13, s10
; GFX7-NEXT:    s_mul_i32 s13, s1, s4
; GFX7-NEXT:    s_cselect_b32 s12, 1, 0
; GFX7-NEXT:    v_readfirstlane_b32 s14, v0
; GFX7-NEXT:    s_add_u32 s9, s13, s9
; GFX7-NEXT:    s_mul_i32 s8, s0, s4
; GFX7-NEXT:    s_addc_u32 s10, s14, s10
; GFX7-NEXT:    s_mul_i32 s0, s0, s7
; GFX7-NEXT:    s_addc_u32 s0, s11, s0
; GFX7-NEXT:    s_mul_i32 s1, s1, s6
; GFX7-NEXT:    s_cmp_lg_u32 s12, 0
; GFX7-NEXT:    s_addc_u32 s0, s0, s1
; GFX7-NEXT:    s_mul_i32 s2, s2, s5
; GFX7-NEXT:    s_add_u32 s0, s2, s0
; GFX7-NEXT:    s_mul_i32 s3, s3, s4
; GFX7-NEXT:    s_add_u32 s3, s3, s0
; GFX7-NEXT:    s_mov_b32 s0, s8
; GFX7-NEXT:    s_mov_b32 s1, s9
; GFX7-NEXT:    s_mov_b32 s2, s10
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i128:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_mul_hi_u32 v2, s1, v1
; GFX8-NEXT:    s_mul_i32 s10, s0, s6
; GFX8-NEXT:    v_readfirstlane_b32 s9, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT:    v_readfirstlane_b32 s13, v2
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_mul_hi_u32 v2, v2, s4
; GFX8-NEXT:    s_mul_i32 s12, s1, s5
; GFX8-NEXT:    v_readfirstlane_b32 s11, v0
; GFX8-NEXT:    s_add_u32 s10, s12, s10
; GFX8-NEXT:    v_mul_hi_u32 v1, s0, v1
; GFX8-NEXT:    v_mov_b32_e32 v0, s1
; GFX8-NEXT:    s_addc_u32 s11, s13, s11
; GFX8-NEXT:    s_mul_i32 s12, s2, s4
; GFX8-NEXT:    v_readfirstlane_b32 s13, v2
; GFX8-NEXT:    s_add_u32 s10, s12, s10
; GFX8-NEXT:    v_mul_hi_u32 v0, v0, s4
; GFX8-NEXT:    s_addc_u32 s11, s13, s11
; GFX8-NEXT:    s_mul_i32 s12, s0, s5
; GFX8-NEXT:    v_readfirstlane_b32 s13, v1
; GFX8-NEXT:    s_add_u32 s9, s12, s9
; GFX8-NEXT:    s_addc_u32 s10, s13, s10
; GFX8-NEXT:    s_mul_i32 s13, s1, s4
; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
; GFX8-NEXT:    v_readfirstlane_b32 s14, v0
; GFX8-NEXT:    s_add_u32 s9, s13, s9
; GFX8-NEXT:    s_mul_i32 s8, s0, s4
; GFX8-NEXT:    s_addc_u32 s10, s14, s10
; GFX8-NEXT:    s_mul_i32 s0, s0, s7
; GFX8-NEXT:    s_addc_u32 s0, s11, s0
; GFX8-NEXT:    s_mul_i32 s1, s1, s6
; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
; GFX8-NEXT:    s_addc_u32 s0, s0, s1
; GFX8-NEXT:    s_mul_i32 s2, s2, s5
; GFX8-NEXT:    s_add_u32 s0, s2, s0
; GFX8-NEXT:    s_mul_i32 s3, s3, s4
; GFX8-NEXT:    s_add_u32 s3, s3, s0
; GFX8-NEXT:    s_mov_b32 s0, s8
; GFX8-NEXT:    s_mov_b32 s1, s9
; GFX8-NEXT:    s_mov_b32 s2, s10
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mul_i32 s10, s0, s6
; GFX9-NEXT:    s_mul_i32 s12, s1, s5
; GFX9-NEXT:    s_mul_hi_u32 s11, s0, s6
; GFX9-NEXT:    s_mul_hi_u32 s13, s1, s5
; GFX9-NEXT:    s_add_u32 s10, s12, s10
; GFX9-NEXT:    s_addc_u32 s11, s13, s11
; GFX9-NEXT:    s_mul_i32 s12, s2, s4
; GFX9-NEXT:    s_mul_hi_u32 s13, s2, s4
; GFX9-NEXT:    s_add_u32 s10, s12, s10
; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s4
; GFX9-NEXT:    s_addc_u32 s11, s13, s11
; GFX9-NEXT:    s_mul_i32 s12, s0, s5
; GFX9-NEXT:    s_mul_hi_u32 s13, s0, s5
; GFX9-NEXT:    s_add_u32 s9, s12, s9
; GFX9-NEXT:    s_addc_u32 s10, s13, s10
; GFX9-NEXT:    s_mul_i32 s13, s1, s4
; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s14, s1, s4
; GFX9-NEXT:    s_add_u32 s9, s13, s9
; GFX9-NEXT:    s_mul_i32 s8, s0, s4
; GFX9-NEXT:    s_addc_u32 s10, s14, s10
; GFX9-NEXT:    s_mul_i32 s0, s0, s7
; GFX9-NEXT:    s_addc_u32 s0, s11, s0
; GFX9-NEXT:    s_mul_i32 s1, s1, s6
; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
; GFX9-NEXT:    s_addc_u32 s0, s0, s1
; GFX9-NEXT:    s_mul_i32 s2, s2, s5
; GFX9-NEXT:    s_add_u32 s0, s2, s0
; GFX9-NEXT:    s_mul_i32 s3, s3, s4
; GFX9-NEXT:    s_add_u32 s3, s3, s0
; GFX9-NEXT:    s_mov_b32 s0, s8
; GFX9-NEXT:    s_mov_b32 s1, s9
; GFX9-NEXT:    s_mov_b32 s2, s10
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i128:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_mul_i32 s9, s0, s6
; GFX10PLUS-NEXT:    s_mul_i32 s11, s1, s5
; GFX10PLUS-NEXT:    s_mul_hi_u32 s10, s0, s6
; GFX10PLUS-NEXT:    s_mul_hi_u32 s12, s1, s5
; GFX10PLUS-NEXT:    s_add_u32 s9, s11, s9
; GFX10PLUS-NEXT:    s_mul_i32 s11, s2, s4
; GFX10PLUS-NEXT:    s_addc_u32 s10, s12, s10
; GFX10PLUS-NEXT:    s_mul_hi_u32 s12, s2, s4
; GFX10PLUS-NEXT:    s_mul_hi_u32 s8, s0, s4
; GFX10PLUS-NEXT:    s_add_u32 s9, s11, s9
; GFX10PLUS-NEXT:    s_mul_i32 s11, s0, s5
; GFX10PLUS-NEXT:    s_addc_u32 s10, s12, s10
; GFX10PLUS-NEXT:    s_mul_hi_u32 s12, s0, s5
; GFX10PLUS-NEXT:    s_add_u32 s8, s11, s8
; GFX10PLUS-NEXT:    s_addc_u32 s9, s12, s9
; GFX10PLUS-NEXT:    s_mul_i32 s12, s1, s4
; GFX10PLUS-NEXT:    s_mul_hi_u32 s13, s1, s4
; GFX10PLUS-NEXT:    s_cselect_b32 s11, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s8, s12, s8
; GFX10PLUS-NEXT:    s_mul_i32 s12, s0, s7
; GFX10PLUS-NEXT:    s_addc_u32 s7, s13, s9
; GFX10PLUS-NEXT:    s_addc_u32 s9, s10, s12
; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s6
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
; GFX10PLUS-NEXT:    s_mul_i32 s2, s2, s5
; GFX10PLUS-NEXT:    s_addc_u32 s1, s9, s1
; GFX10PLUS-NEXT:    s_mul_i32 s3, s3, s4
; GFX10PLUS-NEXT:    s_add_i32 s1, s1, s2
; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s4
; GFX10PLUS-NEXT:    s_add_i32 s3, s1, s3
; GFX10PLUS-NEXT:    s_mov_b32 s1, s8
; GFX10PLUS-NEXT:    s_mov_b32 s2, s7
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = mul i128 %num, %den
  %cast = bitcast i128 %result to <4 x i32>
  ret <4 x i32> %cast
}

; Vector i128 multiply in a default-calling-convention function (operands in
; v0-v3 and v4-v7 in the checks, result in v0-v3).
; All targets accumulate the lower words with v_mad_u64_u32 and fold the
; carry-outs of two of those instructions into the top word with add-with-carry
; instructions (v_addc_u32 on gfx7/gfx8, v_addc_co_u32 on gfx9,
; v_add_co_ci_u32 on gfx10/gfx11). The gfx7-gfx9 checks finish the top word
; with two more v_mad_u64_u32, whereas the gfx10 and gfx11 checks use
; v_mul_lo_u32 and a final v_add3_u32.
define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX7-LABEL: v_mul_i128:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v8, v0
; GFX7-NEXT:    v_mov_b32_e32 v9, v1
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX7-NEXT:    v_mov_b32_e32 v10, v2
; GFX7-NEXT:    v_mul_lo_u32 v7, v8, v7
; GFX7-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
; GFX7-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
; GFX7-NEXT:    v_mul_lo_u32 v6, v9, v6
; GFX7-NEXT:    v_mov_b32_e32 v2, v11
; GFX7-NEXT:    v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
; GFX7-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
; GFX7-NEXT:    v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
; GFX7-NEXT:    v_addc_u32_e32 v6, vcc, v7, v6, vcc
; GFX7-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
; GFX7-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i128:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v8, v0
; GFX8-NEXT:    v_mov_b32_e32 v9, v1
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX8-NEXT:    v_mov_b32_e32 v10, v2
; GFX8-NEXT:    v_mul_lo_u32 v7, v8, v7
; GFX8-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
; GFX8-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
; GFX8-NEXT:    v_mul_lo_u32 v6, v9, v6
; GFX8-NEXT:    v_mov_b32_e32 v2, v11
; GFX8-NEXT:    v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
; GFX8-NEXT:    v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v7, v6, vcc
; GFX8-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v8, v0
; GFX9-NEXT:    v_mov_b32_e32 v9, v1
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX9-NEXT:    v_mov_b32_e32 v10, v2
; GFX9-NEXT:    v_mul_lo_u32 v7, v8, v7
; GFX9-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
; GFX9-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
; GFX9-NEXT:    v_mul_lo_u32 v6, v9, v6
; GFX9-NEXT:    v_mov_b32_e32 v2, v11
; GFX9-NEXT:    v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[4:5], v12, v7, s[4:5]
; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v6, vcc
; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v8, v0
; GFX10-NEXT:    v_mov_b32_e32 v9, v1
; GFX10-NEXT:    v_mov_b32_e32 v10, v2
; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v4
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v8, v6, 0
; GFX10-NEXT:    v_mul_lo_u32 v7, v8, v7
; GFX10-NEXT:    v_mul_lo_u32 v6, v9, v6
; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1]
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v8, v4, 0
; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12]
; GFX10-NEXT:    v_mov_b32_e32 v2, v11
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
; GFX10-NEXT:    v_mul_lo_u32 v5, v10, v5
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2]
; GFX10-NEXT:    v_add_co_ci_u32_e64 v7, s4, v12, v7, s4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo
; GFX10-NEXT:    v_add3_u32 v3, v4, v5, v3
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
; GFX11-NEXT:    v_mov_b32_e32 v10, v2
; GFX11-NEXT:    v_mul_lo_u32 v3, v3, v4
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v8, v6, 0
; GFX11-NEXT:    v_mul_lo_u32 v7, v8, v7
; GFX11-NEXT:    v_mul_lo_u32 v6, v9, v6
; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v9, v5, v[0:1]
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v8, v4, 0
; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v10, v4, v[11:12]
; GFX11-NEXT:    v_mov_b32_e32 v2, v11
; GFX11-NEXT:    v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
; GFX11-NEXT:    v_mul_lo_u32 v5, v10, v5
; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s0, v9, v4, v[1:2]
; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, s0, v12, v7, s0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo
; GFX11-NEXT:    v_add3_u32 v3, v4, v5, v3
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i128 %num, %den
  ret i128 %result
}

; Scalar 256-bit multiply: both i256 operands are marked inreg on an amdgpu_ps
; function, and the product is returned bitcast to <8 x i32>.
; The expected assembly below reads the operands from s0-s7 and s8-s15 and
; writes s0-s7 before returning. The GFX7 and GFX8 sequences form the high
; halves of the 32x32 partial products with v_mul_hi_u32 followed by
; v_readfirstlane_b32, while the GFX9 and GFX10PLUS sequences use s_mul_hi_u32.
; The check lines are autogenerated (see the NOTE at the top of this file), so
; regenerate them with the script rather than editing them by hand.
define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-LABEL: s_mul_i256:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mov_b32 s16, s0
; GFX7-NEXT:    v_mov_b32_e32 v0, s8
; GFX7-NEXT:    v_mul_hi_u32 v0, s16, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s9
; GFX7-NEXT:    v_mul_hi_u32 v2, s1, v1
; GFX7-NEXT:    v_mul_hi_u32 v1, s16, v1
; GFX7-NEXT:    v_readfirstlane_b32 s17, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s10
; GFX7-NEXT:    v_mul_hi_u32 v0, s16, v0
; GFX7-NEXT:    v_readfirstlane_b32 s21, v2
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_readfirstlane_b32 s23, v1
; GFX7-NEXT:    v_readfirstlane_b32 s19, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    v_mul_hi_u32 v1, v0, s8
; GFX7-NEXT:    v_mul_hi_u32 v3, v2, s8
; GFX7-NEXT:    v_mov_b32_e32 v4, s11
; GFX7-NEXT:    s_mul_i32 s18, s16, s10
; GFX7-NEXT:    v_readfirstlane_b32 s24, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s12
; GFX7-NEXT:    v_readfirstlane_b32 s22, v3
; GFX7-NEXT:    v_mul_hi_u32 v3, s16, v1
; GFX7-NEXT:    s_mul_i32 s20, s1, s9
; GFX7-NEXT:    v_mul_hi_u32 v5, s1, v4
; GFX7-NEXT:    s_add_u32 s18, s20, s18
; GFX7-NEXT:    v_readfirstlane_b32 s25, v3
; GFX7-NEXT:    v_mul_hi_u32 v3, v2, s10
; GFX7-NEXT:    s_addc_u32 s19, s21, s19
; GFX7-NEXT:    s_mul_i32 s21, s2, s8
; GFX7-NEXT:    s_cselect_b32 s20, 1, 0
; GFX7-NEXT:    s_add_u32 s18, s21, s18
; GFX7-NEXT:    v_readfirstlane_b32 s28, v3
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    s_addc_u32 s19, s22, s19
; GFX7-NEXT:    s_mul_i32 s22, s16, s9
; GFX7-NEXT:    v_readfirstlane_b32 s27, v5
; GFX7-NEXT:    v_mul_hi_u32 v5, v3, s9
; GFX7-NEXT:    s_cselect_b32 s21, 1, 0
; GFX7-NEXT:    s_add_u32 s17, s22, s17
; GFX7-NEXT:    s_addc_u32 s18, s23, s18
; GFX7-NEXT:    s_mul_i32 s23, s1, s8
; GFX7-NEXT:    s_cselect_b32 s22, 1, 0
; GFX7-NEXT:    s_add_u32 s17, s23, s17
; GFX7-NEXT:    s_addc_u32 s18, s24, s18
; GFX7-NEXT:    s_mul_i32 s24, s16, s12
; GFX7-NEXT:    s_mul_i32 s26, s1, s11
; GFX7-NEXT:    v_readfirstlane_b32 s29, v5
; GFX7-NEXT:    v_mov_b32_e32 v5, s4
; GFX7-NEXT:    s_cselect_b32 s23, 1, 0
; GFX7-NEXT:    s_add_u32 s24, s26, s24
; GFX7-NEXT:    v_mul_hi_u32 v6, v5, s8
; GFX7-NEXT:    s_addc_u32 s25, s27, s25
; GFX7-NEXT:    s_mul_i32 s27, s2, s10
; GFX7-NEXT:    s_cselect_b32 s26, 1, 0
; GFX7-NEXT:    s_add_u32 s24, s27, s24
; GFX7-NEXT:    v_mul_hi_u32 v0, v0, s10
; GFX7-NEXT:    s_addc_u32 s25, s28, s25
; GFX7-NEXT:    s_mul_i32 s28, s3, s9
; GFX7-NEXT:    s_cselect_b32 s27, 1, 0
; GFX7-NEXT:    s_add_u32 s24, s28, s24
; GFX7-NEXT:    v_readfirstlane_b32 s30, v6
; GFX7-NEXT:    v_mul_hi_u32 v6, s16, v4
; GFX7-NEXT:    s_addc_u32 s25, s29, s25
; GFX7-NEXT:    s_mul_i32 s29, s4, s8
; GFX7-NEXT:    s_cselect_b32 s28, 1, 0
; GFX7-NEXT:    s_add_u32 s24, s29, s24
; GFX7-NEXT:    v_readfirstlane_b32 s33, v0
; GFX7-NEXT:    v_mul_hi_u32 v0, v2, s9
; GFX7-NEXT:    s_addc_u32 s25, s30, s25
; GFX7-NEXT:    s_mul_i32 s30, s16, s11
; GFX7-NEXT:    s_cselect_b32 s29, 1, 0
; GFX7-NEXT:    v_readfirstlane_b32 s31, v6
; GFX7-NEXT:    s_add_u32 s19, s30, s19
; GFX7-NEXT:    s_addc_u32 s24, s31, s24
; GFX7-NEXT:    s_mul_i32 s31, s1, s10
; GFX7-NEXT:    s_cselect_b32 s30, 1, 0
; GFX7-NEXT:    s_add_u32 s19, s31, s19
; GFX7-NEXT:    v_readfirstlane_b32 s34, v0
; GFX7-NEXT:    v_mul_hi_u32 v0, v3, s8
; GFX7-NEXT:    s_addc_u32 s24, s33, s24
; GFX7-NEXT:    s_mul_i32 s33, s2, s9
; GFX7-NEXT:    s_cselect_b32 s31, 1, 0
; GFX7-NEXT:    s_add_u32 s19, s33, s19
; GFX7-NEXT:    s_addc_u32 s24, s34, s24
; GFX7-NEXT:    s_mul_i32 s34, s3, s8
; GFX7-NEXT:    s_cselect_b32 s33, 1, 0
; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
; GFX7-NEXT:    s_add_u32 s19, s34, s19
; GFX7-NEXT:    v_mov_b32_e32 v0, s14
; GFX7-NEXT:    s_addc_u32 s24, s35, s24
; GFX7-NEXT:    v_mul_hi_u32 v0, s16, v0
; GFX7-NEXT:    s_cselect_b32 s34, 1, 0
; GFX7-NEXT:    s_cmp_lg_u32 s23, 0
; GFX7-NEXT:    s_addc_u32 s19, s22, s19
; GFX7-NEXT:    v_mov_b32_e32 v2, s13
; GFX7-NEXT:    s_cselect_b32 s22, 1, 0
; GFX7-NEXT:    s_cmp_lg_u32 s21, 0
; GFX7-NEXT:    v_mul_hi_u32 v6, s1, v2
; GFX7-NEXT:    s_addc_u32 s20, s20, 0
; GFX7-NEXT:    v_readfirstlane_b32 s23, v0
; GFX7-NEXT:    v_mul_hi_u32 v0, s2, v1
; GFX7-NEXT:    s_cmp_lg_u32 s22, 0
; GFX7-NEXT:    s_addc_u32 s20, s20, s24
; GFX7-NEXT:    s_mul_i32 s22, s16, s14
; GFX7-NEXT:    s_mul_i32 s24, s1, s13
; GFX7-NEXT:    s_cselect_b32 s21, 1, 0
; GFX7-NEXT:    v_readfirstlane_b32 s35, v6
; GFX7-NEXT:    s_add_u32 s22, s24, s22
; GFX7-NEXT:    s_addc_u32 s23, s35, s23
; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
; GFX7-NEXT:    v_mul_hi_u32 v0, v3, s11
; GFX7-NEXT:    s_mul_i32 s24, s2, s12
; GFX7-NEXT:    s_add_u32 s22, s24, s22
; GFX7-NEXT:    s_addc_u32 s23, s35, s23
; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
; GFX7-NEXT:    v_mul_hi_u32 v0, v5, s10
; GFX7-NEXT:    s_mul_i32 s24, s3, s11
; GFX7-NEXT:    s_add_u32 s22, s24, s22
; GFX7-NEXT:    s_addc_u32 s23, s35, s23
; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s5
; GFX7-NEXT:    v_mul_hi_u32 v6, v0, s9
; GFX7-NEXT:    s_mul_i32 s24, s4, s10
; GFX7-NEXT:    s_add_u32 s22, s24, s22
; GFX7-NEXT:    v_mul_hi_u32 v1, s1, v1
; GFX7-NEXT:    s_addc_u32 s23, s35, s23
; GFX7-NEXT:    v_readfirstlane_b32 s35, v6
; GFX7-NEXT:    v_mov_b32_e32 v6, s6
; GFX7-NEXT:    v_mul_hi_u32 v6, v6, s8
; GFX7-NEXT:    s_mul_i32 s24, s5, s9
; GFX7-NEXT:    s_add_u32 s22, s24, s22
; GFX7-NEXT:    v_mul_hi_u32 v2, s16, v2
; GFX7-NEXT:    v_readfirstlane_b32 s36, v1
; GFX7-NEXT:    v_mul_hi_u32 v1, s2, v4
; GFX7-NEXT:    s_addc_u32 s23, s35, s23
; GFX7-NEXT:    s_mul_i32 s24, s6, s8
; GFX7-NEXT:    v_readfirstlane_b32 s35, v6
; GFX7-NEXT:    s_add_u32 s22, s24, s22
; GFX7-NEXT:    s_addc_u32 s23, s35, s23
; GFX7-NEXT:    s_mul_i32 s24, s16, s13
; GFX7-NEXT:    v_readfirstlane_b32 s35, v2
; GFX7-NEXT:    s_add_u32 s24, s24, s25
; GFX7-NEXT:    v_readfirstlane_b32 s37, v1
; GFX7-NEXT:    v_mul_hi_u32 v1, v3, s10
; GFX7-NEXT:    s_addc_u32 s22, s35, s22
; GFX7-NEXT:    s_mul_i32 s35, s1, s12
; GFX7-NEXT:    s_cselect_b32 s25, 1, 0
; GFX7-NEXT:    s_add_u32 s24, s35, s24
; GFX7-NEXT:    s_addc_u32 s22, s36, s22
; GFX7-NEXT:    s_mul_i32 s36, s2, s11
; GFX7-NEXT:    s_cselect_b32 s35, 1, 0
; GFX7-NEXT:    s_add_u32 s24, s36, s24
; GFX7-NEXT:    v_readfirstlane_b32 s38, v1
; GFX7-NEXT:    v_mul_hi_u32 v1, v5, s9
; GFX7-NEXT:    s_addc_u32 s22, s37, s22
; GFX7-NEXT:    s_mul_i32 s37, s3, s10
; GFX7-NEXT:    s_cselect_b32 s36, 1, 0
; GFX7-NEXT:    s_add_u32 s24, s37, s24
; GFX7-NEXT:    v_mul_hi_u32 v0, v0, s8
; GFX7-NEXT:    s_addc_u32 s22, s38, s22
; GFX7-NEXT:    s_mul_i32 s38, s4, s9
; GFX7-NEXT:    s_cselect_b32 s37, 1, 0
; GFX7-NEXT:    v_readfirstlane_b32 s39, v1
; GFX7-NEXT:    s_add_u32 s24, s38, s24
; GFX7-NEXT:    s_addc_u32 s22, s39, s22
; GFX7-NEXT:    s_mul_i32 s39, s5, s8
; GFX7-NEXT:    s_cselect_b32 s38, 1, 0
; GFX7-NEXT:    v_readfirstlane_b32 s40, v0
; GFX7-NEXT:    s_add_u32 s24, s39, s24
; GFX7-NEXT:    s_addc_u32 s22, s40, s22
; GFX7-NEXT:    s_cselect_b32 s39, 1, 0
; GFX7-NEXT:    s_cmp_lg_u32 s31, 0
; GFX7-NEXT:    s_addc_u32 s30, s30, 0
; GFX7-NEXT:    s_cmp_lg_u32 s33, 0
; GFX7-NEXT:    s_addc_u32 s30, s30, 0
; GFX7-NEXT:    s_cmp_lg_u32 s34, 0
; GFX7-NEXT:    s_addc_u32 s30, s30, 0
; GFX7-NEXT:    s_cmp_lg_u32 s21, 0
; GFX7-NEXT:    s_addc_u32 s21, s30, s24
; GFX7-NEXT:    s_cselect_b32 s24, 1, 0
; GFX7-NEXT:    s_cmp_lg_u32 s27, 0
; GFX7-NEXT:    s_addc_u32 s26, s26, 0
; GFX7-NEXT:    s_cmp_lg_u32 s28, 0
; GFX7-NEXT:    s_addc_u32 s26, s26, 0
; GFX7-NEXT:    s_cmp_lg_u32 s29, 0
; GFX7-NEXT:    s_addc_u32 s26, s26, 0
; GFX7-NEXT:    s_cmp_lg_u32 s24, 0
; GFX7-NEXT:    s_addc_u32 s22, s26, s22
; GFX7-NEXT:    s_mul_i32 s16, s16, s15
; GFX7-NEXT:    s_addc_u32 s15, s23, s16
; GFX7-NEXT:    s_mul_i32 s1, s1, s14
; GFX7-NEXT:    s_cmp_lg_u32 s39, 0
; GFX7-NEXT:    s_addc_u32 s1, s15, s1
; GFX7-NEXT:    s_mul_i32 s2, s2, s13
; GFX7-NEXT:    s_cmp_lg_u32 s38, 0
; GFX7-NEXT:    s_addc_u32 s1, s1, s2
; GFX7-NEXT:    s_mul_i32 s3, s3, s12
; GFX7-NEXT:    s_cmp_lg_u32 s37, 0
; GFX7-NEXT:    s_addc_u32 s1, s1, s3
; GFX7-NEXT:    s_mul_i32 s4, s4, s11
; GFX7-NEXT:    s_cmp_lg_u32 s36, 0
; GFX7-NEXT:    s_addc_u32 s1, s1, s4
; GFX7-NEXT:    s_mul_i32 s5, s5, s10
; GFX7-NEXT:    s_cmp_lg_u32 s35, 0
; GFX7-NEXT:    s_addc_u32 s1, s1, s5
; GFX7-NEXT:    s_mul_i32 s6, s6, s9
; GFX7-NEXT:    s_cmp_lg_u32 s25, 0
; GFX7-NEXT:    s_addc_u32 s1, s1, s6
; GFX7-NEXT:    s_mul_i32 s7, s7, s8
; GFX7-NEXT:    s_mul_i32 s0, s0, s8
; GFX7-NEXT:    s_add_u32 s7, s7, s1
; GFX7-NEXT:    s_mov_b32 s1, s17
; GFX7-NEXT:    s_mov_b32 s2, s18
; GFX7-NEXT:    s_mov_b32 s3, s19
; GFX7-NEXT:    s_mov_b32 s4, s20
; GFX7-NEXT:    s_mov_b32 s5, s21
; GFX7-NEXT:    s_mov_b32 s6, s22
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i256:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_mov_b32 s16, s0
; GFX8-NEXT:    v_mov_b32_e32 v0, s8
; GFX8-NEXT:    v_mul_hi_u32 v0, s16, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s9
; GFX8-NEXT:    v_mul_hi_u32 v2, s1, v1
; GFX8-NEXT:    v_mul_hi_u32 v1, s16, v1
; GFX8-NEXT:    v_readfirstlane_b32 s17, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s10
; GFX8-NEXT:    v_mul_hi_u32 v0, s16, v0
; GFX8-NEXT:    v_readfirstlane_b32 s21, v2
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_readfirstlane_b32 s23, v1
; GFX8-NEXT:    v_readfirstlane_b32 s19, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s1
; GFX8-NEXT:    v_mul_hi_u32 v1, v0, s8
; GFX8-NEXT:    v_mul_hi_u32 v3, v2, s8
; GFX8-NEXT:    v_mov_b32_e32 v4, s11
; GFX8-NEXT:    s_mul_i32 s18, s16, s10
; GFX8-NEXT:    v_readfirstlane_b32 s24, v1
; GFX8-NEXT:    v_mov_b32_e32 v1, s12
; GFX8-NEXT:    v_readfirstlane_b32 s22, v3
; GFX8-NEXT:    v_mul_hi_u32 v3, s16, v1
; GFX8-NEXT:    s_mul_i32 s20, s1, s9
; GFX8-NEXT:    v_mul_hi_u32 v5, s1, v4
; GFX8-NEXT:    s_add_u32 s18, s20, s18
; GFX8-NEXT:    v_readfirstlane_b32 s25, v3
; GFX8-NEXT:    v_mul_hi_u32 v3, v2, s10
; GFX8-NEXT:    s_addc_u32 s19, s21, s19
; GFX8-NEXT:    s_mul_i32 s21, s2, s8
; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
; GFX8-NEXT:    s_add_u32 s18, s21, s18
; GFX8-NEXT:    v_readfirstlane_b32 s28, v3
; GFX8-NEXT:    v_mov_b32_e32 v3, s3
; GFX8-NEXT:    s_addc_u32 s19, s22, s19
; GFX8-NEXT:    s_mul_i32 s22, s16, s9
; GFX8-NEXT:    v_readfirstlane_b32 s27, v5
; GFX8-NEXT:    v_mul_hi_u32 v5, v3, s9
; GFX8-NEXT:    s_cselect_b32 s21, 1, 0
; GFX8-NEXT:    s_add_u32 s17, s22, s17
; GFX8-NEXT:    s_addc_u32 s18, s23, s18
; GFX8-NEXT:    s_mul_i32 s23, s1, s8
; GFX8-NEXT:    s_cselect_b32 s22, 1, 0
; GFX8-NEXT:    s_add_u32 s17, s23, s17
; GFX8-NEXT:    s_addc_u32 s18, s24, s18
; GFX8-NEXT:    s_mul_i32 s24, s16, s12
; GFX8-NEXT:    s_mul_i32 s26, s1, s11
; GFX8-NEXT:    v_readfirstlane_b32 s29, v5
; GFX8-NEXT:    v_mov_b32_e32 v5, s4
; GFX8-NEXT:    s_cselect_b32 s23, 1, 0
; GFX8-NEXT:    s_add_u32 s24, s26, s24
; GFX8-NEXT:    v_mul_hi_u32 v6, v5, s8
; GFX8-NEXT:    s_addc_u32 s25, s27, s25
; GFX8-NEXT:    s_mul_i32 s27, s2, s10
; GFX8-NEXT:    s_cselect_b32 s26, 1, 0
; GFX8-NEXT:    s_add_u32 s24, s27, s24
; GFX8-NEXT:    v_mul_hi_u32 v0, v0, s10
; GFX8-NEXT:    s_addc_u32 s25, s28, s25
; GFX8-NEXT:    s_mul_i32 s28, s3, s9
; GFX8-NEXT:    s_cselect_b32 s27, 1, 0
; GFX8-NEXT:    s_add_u32 s24, s28, s24
; GFX8-NEXT:    v_readfirstlane_b32 s30, v6
; GFX8-NEXT:    v_mul_hi_u32 v6, s16, v4
; GFX8-NEXT:    s_addc_u32 s25, s29, s25
; GFX8-NEXT:    s_mul_i32 s29, s4, s8
; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
; GFX8-NEXT:    s_add_u32 s24, s29, s24
; GFX8-NEXT:    v_readfirstlane_b32 s33, v0
; GFX8-NEXT:    v_mul_hi_u32 v0, v2, s9
; GFX8-NEXT:    s_addc_u32 s25, s30, s25
; GFX8-NEXT:    s_mul_i32 s30, s16, s11
; GFX8-NEXT:    s_cselect_b32 s29, 1, 0
; GFX8-NEXT:    v_readfirstlane_b32 s31, v6
; GFX8-NEXT:    s_add_u32 s19, s30, s19
; GFX8-NEXT:    s_addc_u32 s24, s31, s24
; GFX8-NEXT:    s_mul_i32 s31, s1, s10
; GFX8-NEXT:    s_cselect_b32 s30, 1, 0
; GFX8-NEXT:    s_add_u32 s19, s31, s19
; GFX8-NEXT:    v_readfirstlane_b32 s34, v0
; GFX8-NEXT:    v_mul_hi_u32 v0, v3, s8
; GFX8-NEXT:    s_addc_u32 s24, s33, s24
; GFX8-NEXT:    s_mul_i32 s33, s2, s9
; GFX8-NEXT:    s_cselect_b32 s31, 1, 0
; GFX8-NEXT:    s_add_u32 s19, s33, s19
; GFX8-NEXT:    s_addc_u32 s24, s34, s24
; GFX8-NEXT:    s_mul_i32 s34, s3, s8
; GFX8-NEXT:    s_cselect_b32 s33, 1, 0
; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
; GFX8-NEXT:    s_add_u32 s19, s34, s19
; GFX8-NEXT:    v_mov_b32_e32 v0, s14
; GFX8-NEXT:    s_addc_u32 s24, s35, s24
; GFX8-NEXT:    v_mul_hi_u32 v0, s16, v0
; GFX8-NEXT:    s_cselect_b32 s34, 1, 0
; GFX8-NEXT:    s_cmp_lg_u32 s23, 0
; GFX8-NEXT:    s_addc_u32 s19, s22, s19
; GFX8-NEXT:    v_mov_b32_e32 v2, s13
; GFX8-NEXT:    s_cselect_b32 s22, 1, 0
; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
; GFX8-NEXT:    v_mul_hi_u32 v6, s1, v2
; GFX8-NEXT:    s_addc_u32 s20, s20, 0
; GFX8-NEXT:    v_readfirstlane_b32 s23, v0
; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v1
; GFX8-NEXT:    s_cmp_lg_u32 s22, 0
; GFX8-NEXT:    s_addc_u32 s20, s20, s24
; GFX8-NEXT:    s_mul_i32 s22, s16, s14
; GFX8-NEXT:    s_mul_i32 s24, s1, s13
; GFX8-NEXT:    s_cselect_b32 s21, 1, 0
; GFX8-NEXT:    v_readfirstlane_b32 s35, v6
; GFX8-NEXT:    s_add_u32 s22, s24, s22
; GFX8-NEXT:    s_addc_u32 s23, s35, s23
; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
; GFX8-NEXT:    v_mul_hi_u32 v0, v3, s11
; GFX8-NEXT:    s_mul_i32 s24, s2, s12
; GFX8-NEXT:    s_add_u32 s22, s24, s22
; GFX8-NEXT:    s_addc_u32 s23, s35, s23
; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
; GFX8-NEXT:    v_mul_hi_u32 v0, v5, s10
; GFX8-NEXT:    s_mul_i32 s24, s3, s11
; GFX8-NEXT:    s_add_u32 s22, s24, s22
; GFX8-NEXT:    s_addc_u32 s23, s35, s23
; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NEXT:    v_mul_hi_u32 v6, v0, s9
; GFX8-NEXT:    s_mul_i32 s24, s4, s10
; GFX8-NEXT:    s_add_u32 s22, s24, s22
; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
; GFX8-NEXT:    s_addc_u32 s23, s35, s23
; GFX8-NEXT:    v_readfirstlane_b32 s35, v6
; GFX8-NEXT:    v_mov_b32_e32 v6, s6
; GFX8-NEXT:    v_mul_hi_u32 v6, v6, s8
; GFX8-NEXT:    s_mul_i32 s24, s5, s9
; GFX8-NEXT:    s_add_u32 s22, s24, s22
; GFX8-NEXT:    v_mul_hi_u32 v2, s16, v2
; GFX8-NEXT:    v_readfirstlane_b32 s36, v1
; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v4
; GFX8-NEXT:    s_addc_u32 s23, s35, s23
; GFX8-NEXT:    s_mul_i32 s24, s6, s8
; GFX8-NEXT:    v_readfirstlane_b32 s35, v6
; GFX8-NEXT:    s_add_u32 s22, s24, s22
; GFX8-NEXT:    s_addc_u32 s23, s35, s23
; GFX8-NEXT:    s_mul_i32 s24, s16, s13
; GFX8-NEXT:    v_readfirstlane_b32 s35, v2
; GFX8-NEXT:    s_add_u32 s24, s24, s25
; GFX8-NEXT:    v_readfirstlane_b32 s37, v1
; GFX8-NEXT:    v_mul_hi_u32 v1, v3, s10
; GFX8-NEXT:    s_addc_u32 s22, s35, s22
; GFX8-NEXT:    s_mul_i32 s35, s1, s12
; GFX8-NEXT:    s_cselect_b32 s25, 1, 0
; GFX8-NEXT:    s_add_u32 s24, s35, s24
; GFX8-NEXT:    s_addc_u32 s22, s36, s22
; GFX8-NEXT:    s_mul_i32 s36, s2, s11
; GFX8-NEXT:    s_cselect_b32 s35, 1, 0
; GFX8-NEXT:    s_add_u32 s24, s36, s24
; GFX8-NEXT:    v_readfirstlane_b32 s38, v1
; GFX8-NEXT:    v_mul_hi_u32 v1, v5, s9
; GFX8-NEXT:    s_addc_u32 s22, s37, s22
; GFX8-NEXT:    s_mul_i32 s37, s3, s10
; GFX8-NEXT:    s_cselect_b32 s36, 1, 0
; GFX8-NEXT:    s_add_u32 s24, s37, s24
; GFX8-NEXT:    v_mul_hi_u32 v0, v0, s8
; GFX8-NEXT:    s_addc_u32 s22, s38, s22
; GFX8-NEXT:    s_mul_i32 s38, s4, s9
; GFX8-NEXT:    s_cselect_b32 s37, 1, 0
; GFX8-NEXT:    v_readfirstlane_b32 s39, v1
; GFX8-NEXT:    s_add_u32 s24, s38, s24
; GFX8-NEXT:    s_addc_u32 s22, s39, s22
; GFX8-NEXT:    s_mul_i32 s39, s5, s8
; GFX8-NEXT:    s_cselect_b32 s38, 1, 0
; GFX8-NEXT:    v_readfirstlane_b32 s40, v0
; GFX8-NEXT:    s_add_u32 s24, s39, s24
; GFX8-NEXT:    s_addc_u32 s22, s40, s22
; GFX8-NEXT:    s_cselect_b32 s39, 1, 0
; GFX8-NEXT:    s_cmp_lg_u32 s31, 0
; GFX8-NEXT:    s_addc_u32 s30, s30, 0
; GFX8-NEXT:    s_cmp_lg_u32 s33, 0
; GFX8-NEXT:    s_addc_u32 s30, s30, 0
; GFX8-NEXT:    s_cmp_lg_u32 s34, 0
; GFX8-NEXT:    s_addc_u32 s30, s30, 0
; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
; GFX8-NEXT:    s_addc_u32 s21, s30, s24
; GFX8-NEXT:    s_cselect_b32 s24, 1, 0
; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
; GFX8-NEXT:    s_addc_u32 s26, s26, 0
; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
; GFX8-NEXT:    s_addc_u32 s26, s26, 0
; GFX8-NEXT:    s_cmp_lg_u32 s29, 0
; GFX8-NEXT:    s_addc_u32 s26, s26, 0
; GFX8-NEXT:    s_cmp_lg_u32 s24, 0
; GFX8-NEXT:    s_addc_u32 s22, s26, s22
; GFX8-NEXT:    s_mul_i32 s16, s16, s15
; GFX8-NEXT:    s_addc_u32 s15, s23, s16
; GFX8-NEXT:    s_mul_i32 s1, s1, s14
; GFX8-NEXT:    s_cmp_lg_u32 s39, 0
; GFX8-NEXT:    s_addc_u32 s1, s15, s1
; GFX8-NEXT:    s_mul_i32 s2, s2, s13
; GFX8-NEXT:    s_cmp_lg_u32 s38, 0
; GFX8-NEXT:    s_addc_u32 s1, s1, s2
; GFX8-NEXT:    s_mul_i32 s3, s3, s12
; GFX8-NEXT:    s_cmp_lg_u32 s37, 0
; GFX8-NEXT:    s_addc_u32 s1, s1, s3
; GFX8-NEXT:    s_mul_i32 s4, s4, s11
; GFX8-NEXT:    s_cmp_lg_u32 s36, 0
; GFX8-NEXT:    s_addc_u32 s1, s1, s4
; GFX8-NEXT:    s_mul_i32 s5, s5, s10
; GFX8-NEXT:    s_cmp_lg_u32 s35, 0
; GFX8-NEXT:    s_addc_u32 s1, s1, s5
; GFX8-NEXT:    s_mul_i32 s6, s6, s9
; GFX8-NEXT:    s_cmp_lg_u32 s25, 0
; GFX8-NEXT:    s_addc_u32 s1, s1, s6
; GFX8-NEXT:    s_mul_i32 s7, s7, s8
; GFX8-NEXT:    s_mul_i32 s0, s0, s8
; GFX8-NEXT:    s_add_u32 s7, s7, s1
; GFX8-NEXT:    s_mov_b32 s1, s17
; GFX8-NEXT:    s_mov_b32 s2, s18
; GFX8-NEXT:    s_mov_b32 s3, s19
; GFX8-NEXT:    s_mov_b32 s4, s20
; GFX8-NEXT:    s_mov_b32 s5, s21
; GFX8-NEXT:    s_mov_b32 s6, s22
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i256:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mul_i32 s18, s0, s10
; GFX9-NEXT:    s_mul_i32 s20, s1, s9
; GFX9-NEXT:    s_mul_hi_u32 s19, s0, s10
; GFX9-NEXT:    s_mul_hi_u32 s21, s1, s9
; GFX9-NEXT:    s_add_u32 s18, s20, s18
; GFX9-NEXT:    s_addc_u32 s19, s21, s19
; GFX9-NEXT:    s_mul_i32 s21, s2, s8
; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s22, s2, s8
; GFX9-NEXT:    s_add_u32 s18, s21, s18
; GFX9-NEXT:    s_mul_hi_u32 s17, s0, s8
; GFX9-NEXT:    s_addc_u32 s19, s22, s19
; GFX9-NEXT:    s_mul_i32 s22, s0, s9
; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s23, s0, s9
; GFX9-NEXT:    s_add_u32 s17, s22, s17
; GFX9-NEXT:    s_addc_u32 s18, s23, s18
; GFX9-NEXT:    s_mul_i32 s23, s1, s8
; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s24, s1, s8
; GFX9-NEXT:    s_add_u32 s17, s23, s17
; GFX9-NEXT:    s_addc_u32 s18, s24, s18
; GFX9-NEXT:    s_mul_i32 s24, s0, s12
; GFX9-NEXT:    s_mul_i32 s26, s1, s11
; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s25, s0, s12
; GFX9-NEXT:    s_mul_hi_u32 s27, s1, s11
; GFX9-NEXT:    s_add_u32 s24, s26, s24
; GFX9-NEXT:    s_addc_u32 s25, s27, s25
; GFX9-NEXT:    s_mul_i32 s27, s2, s10
; GFX9-NEXT:    s_cselect_b32 s26, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s28, s2, s10
; GFX9-NEXT:    s_add_u32 s24, s27, s24
; GFX9-NEXT:    s_addc_u32 s25, s28, s25
; GFX9-NEXT:    s_mul_i32 s28, s3, s9
; GFX9-NEXT:    s_cselect_b32 s27, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s29, s3, s9
; GFX9-NEXT:    s_add_u32 s24, s28, s24
; GFX9-NEXT:    s_addc_u32 s25, s29, s25
; GFX9-NEXT:    s_mul_i32 s29, s4, s8
; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s30, s4, s8
; GFX9-NEXT:    s_add_u32 s24, s29, s24
; GFX9-NEXT:    s_addc_u32 s25, s30, s25
; GFX9-NEXT:    s_mul_i32 s30, s0, s11
; GFX9-NEXT:    s_cselect_b32 s29, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s31, s0, s11
; GFX9-NEXT:    s_add_u32 s19, s30, s19
; GFX9-NEXT:    s_addc_u32 s24, s31, s24
; GFX9-NEXT:    s_mul_i32 s31, s1, s10
; GFX9-NEXT:    s_cselect_b32 s30, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s33, s1, s10
; GFX9-NEXT:    s_add_u32 s19, s31, s19
; GFX9-NEXT:    s_addc_u32 s24, s33, s24
; GFX9-NEXT:    s_mul_i32 s33, s2, s9
; GFX9-NEXT:    s_cselect_b32 s31, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s34, s2, s9
; GFX9-NEXT:    s_add_u32 s19, s33, s19
; GFX9-NEXT:    s_addc_u32 s24, s34, s24
; GFX9-NEXT:    s_mul_i32 s34, s3, s8
; GFX9-NEXT:    s_cselect_b32 s33, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s35, s3, s8
; GFX9-NEXT:    s_add_u32 s19, s34, s19
; GFX9-NEXT:    s_addc_u32 s24, s35, s24
; GFX9-NEXT:    s_cselect_b32 s34, 1, 0
; GFX9-NEXT:    s_cmp_lg_u32 s23, 0
; GFX9-NEXT:    s_addc_u32 s19, s22, s19
; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
; GFX9-NEXT:    s_addc_u32 s20, s20, 0
; GFX9-NEXT:    s_cmp_lg_u32 s22, 0
; GFX9-NEXT:    s_addc_u32 s20, s20, s24
; GFX9-NEXT:    s_mul_i32 s22, s0, s14
; GFX9-NEXT:    s_mul_i32 s24, s1, s13
; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s23, s0, s14
; GFX9-NEXT:    s_mul_hi_u32 s35, s1, s13
; GFX9-NEXT:    s_add_u32 s22, s24, s22
; GFX9-NEXT:    s_addc_u32 s23, s35, s23
; GFX9-NEXT:    s_mul_i32 s24, s2, s12
; GFX9-NEXT:    s_mul_hi_u32 s35, s2, s12
; GFX9-NEXT:    s_add_u32 s22, s24, s22
; GFX9-NEXT:    s_addc_u32 s23, s35, s23
; GFX9-NEXT:    s_mul_i32 s24, s3, s11
; GFX9-NEXT:    s_mul_hi_u32 s35, s3, s11
; GFX9-NEXT:    s_add_u32 s22, s24, s22
; GFX9-NEXT:    s_addc_u32 s23, s35, s23
; GFX9-NEXT:    s_mul_i32 s24, s4, s10
; GFX9-NEXT:    s_mul_hi_u32 s35, s4, s10
; GFX9-NEXT:    s_add_u32 s22, s24, s22
; GFX9-NEXT:    s_addc_u32 s23, s35, s23
; GFX9-NEXT:    s_mul_i32 s24, s5, s9
; GFX9-NEXT:    s_mul_hi_u32 s35, s5, s9
; GFX9-NEXT:    s_add_u32 s22, s24, s22
; GFX9-NEXT:    s_addc_u32 s23, s35, s23
; GFX9-NEXT:    s_mul_i32 s24, s6, s8
; GFX9-NEXT:    s_mul_hi_u32 s35, s6, s8
; GFX9-NEXT:    s_add_u32 s22, s24, s22
; GFX9-NEXT:    s_addc_u32 s23, s35, s23
; GFX9-NEXT:    s_mul_i32 s24, s0, s13
; GFX9-NEXT:    s_mul_hi_u32 s35, s0, s13
; GFX9-NEXT:    s_add_u32 s24, s24, s25
; GFX9-NEXT:    s_addc_u32 s22, s35, s22
; GFX9-NEXT:    s_mul_i32 s35, s1, s12
; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s36, s1, s12
; GFX9-NEXT:    s_add_u32 s24, s35, s24
; GFX9-NEXT:    s_addc_u32 s22, s36, s22
; GFX9-NEXT:    s_mul_i32 s36, s2, s11
; GFX9-NEXT:    s_cselect_b32 s35, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s37, s2, s11
; GFX9-NEXT:    s_add_u32 s24, s36, s24
; GFX9-NEXT:    s_addc_u32 s22, s37, s22
; GFX9-NEXT:    s_mul_i32 s37, s3, s10
; GFX9-NEXT:    s_cselect_b32 s36, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s38, s3, s10
; GFX9-NEXT:    s_add_u32 s24, s37, s24
; GFX9-NEXT:    s_addc_u32 s22, s38, s22
; GFX9-NEXT:    s_mul_i32 s38, s4, s9
; GFX9-NEXT:    s_cselect_b32 s37, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s39, s4, s9
; GFX9-NEXT:    s_add_u32 s24, s38, s24
; GFX9-NEXT:    s_addc_u32 s22, s39, s22
; GFX9-NEXT:    s_mul_i32 s39, s5, s8
; GFX9-NEXT:    s_cselect_b32 s38, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s40, s5, s8
; GFX9-NEXT:    s_add_u32 s24, s39, s24
; GFX9-NEXT:    s_addc_u32 s22, s40, s22
; GFX9-NEXT:    s_cselect_b32 s39, 1, 0
; GFX9-NEXT:    s_cmp_lg_u32 s31, 0
; GFX9-NEXT:    s_addc_u32 s30, s30, 0
; GFX9-NEXT:    s_cmp_lg_u32 s33, 0
; GFX9-NEXT:    s_addc_u32 s30, s30, 0
; GFX9-NEXT:    s_cmp_lg_u32 s34, 0
; GFX9-NEXT:    s_addc_u32 s30, s30, 0
; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
; GFX9-NEXT:    s_addc_u32 s21, s30, s24
; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
; GFX9-NEXT:    s_addc_u32 s26, s26, 0
; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
; GFX9-NEXT:    s_addc_u32 s26, s26, 0
; GFX9-NEXT:    s_cmp_lg_u32 s29, 0
; GFX9-NEXT:    s_addc_u32 s26, s26, 0
; GFX9-NEXT:    s_cmp_lg_u32 s24, 0
; GFX9-NEXT:    s_mul_i32 s16, s0, s8
; GFX9-NEXT:    s_addc_u32 s22, s26, s22
; GFX9-NEXT:    s_mul_i32 s0, s0, s15
; GFX9-NEXT:    s_addc_u32 s0, s23, s0
; GFX9-NEXT:    s_mul_i32 s1, s1, s14
; GFX9-NEXT:    s_cmp_lg_u32 s39, 0
; GFX9-NEXT:    s_addc_u32 s0, s0, s1
; GFX9-NEXT:    s_mul_i32 s2, s2, s13
; GFX9-NEXT:    s_cmp_lg_u32 s38, 0
; GFX9-NEXT:    s_addc_u32 s0, s0, s2
; GFX9-NEXT:    s_mul_i32 s3, s3, s12
; GFX9-NEXT:    s_cmp_lg_u32 s37, 0
; GFX9-NEXT:    s_addc_u32 s0, s0, s3
; GFX9-NEXT:    s_mul_i32 s4, s4, s11
; GFX9-NEXT:    s_cmp_lg_u32 s36, 0
; GFX9-NEXT:    s_addc_u32 s0, s0, s4
; GFX9-NEXT:    s_mul_i32 s5, s5, s10
; GFX9-NEXT:    s_cmp_lg_u32 s35, 0
; GFX9-NEXT:    s_addc_u32 s0, s0, s5
; GFX9-NEXT:    s_mul_i32 s6, s6, s9
; GFX9-NEXT:    s_cmp_lg_u32 s25, 0
; GFX9-NEXT:    s_addc_u32 s0, s0, s6
; GFX9-NEXT:    s_mul_i32 s7, s7, s8
; GFX9-NEXT:    s_add_u32 s7, s7, s0
; GFX9-NEXT:    s_mov_b32 s0, s16
; GFX9-NEXT:    s_mov_b32 s1, s17
; GFX9-NEXT:    s_mov_b32 s2, s18
; GFX9-NEXT:    s_mov_b32 s3, s19
; GFX9-NEXT:    s_mov_b32 s4, s20
; GFX9-NEXT:    s_mov_b32 s5, s21
; GFX9-NEXT:    s_mov_b32 s6, s22
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i256:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_mul_i32 s17, s0, s10
; GFX10PLUS-NEXT:    s_mul_i32 s19, s1, s9
; GFX10PLUS-NEXT:    s_mul_hi_u32 s18, s0, s10
; GFX10PLUS-NEXT:    s_mul_hi_u32 s20, s1, s9
; GFX10PLUS-NEXT:    s_add_u32 s17, s19, s17
; GFX10PLUS-NEXT:    s_addc_u32 s18, s20, s18
; GFX10PLUS-NEXT:    s_mul_i32 s20, s2, s8
; GFX10PLUS-NEXT:    s_mul_hi_u32 s21, s2, s8
; GFX10PLUS-NEXT:    s_cselect_b32 s19, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s17, s20, s17
; GFX10PLUS-NEXT:    s_mul_hi_u32 s16, s0, s8
; GFX10PLUS-NEXT:    s_addc_u32 s18, s21, s18
; GFX10PLUS-NEXT:    s_mul_i32 s21, s0, s9
; GFX10PLUS-NEXT:    s_mul_hi_u32 s22, s0, s9
; GFX10PLUS-NEXT:    s_cselect_b32 s20, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s16, s21, s16
; GFX10PLUS-NEXT:    s_addc_u32 s17, s22, s17
; GFX10PLUS-NEXT:    s_mul_i32 s22, s1, s8
; GFX10PLUS-NEXT:    s_mul_hi_u32 s23, s1, s8
; GFX10PLUS-NEXT:    s_cselect_b32 s21, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s16, s22, s16
; GFX10PLUS-NEXT:    s_addc_u32 s17, s23, s17
; GFX10PLUS-NEXT:    s_mul_i32 s23, s0, s12
; GFX10PLUS-NEXT:    s_mul_i32 s25, s1, s11
; GFX10PLUS-NEXT:    s_mul_hi_u32 s24, s0, s12
; GFX10PLUS-NEXT:    s_mul_hi_u32 s26, s1, s11
; GFX10PLUS-NEXT:    s_cselect_b32 s22, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s23, s25, s23
; GFX10PLUS-NEXT:    s_addc_u32 s24, s26, s24
; GFX10PLUS-NEXT:    s_mul_i32 s26, s2, s10
; GFX10PLUS-NEXT:    s_mul_hi_u32 s27, s2, s10
; GFX10PLUS-NEXT:    s_cselect_b32 s25, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s23, s26, s23
; GFX10PLUS-NEXT:    s_addc_u32 s24, s27, s24
; GFX10PLUS-NEXT:    s_mul_i32 s27, s3, s9
; GFX10PLUS-NEXT:    s_mul_hi_u32 s28, s3, s9
; GFX10PLUS-NEXT:    s_cselect_b32 s26, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s23, s27, s23
; GFX10PLUS-NEXT:    s_addc_u32 s24, s28, s24
; GFX10PLUS-NEXT:    s_mul_i32 s28, s4, s8
; GFX10PLUS-NEXT:    s_mul_hi_u32 s29, s4, s8
; GFX10PLUS-NEXT:    s_cselect_b32 s27, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s23, s28, s23
; GFX10PLUS-NEXT:    s_addc_u32 s24, s29, s24
; GFX10PLUS-NEXT:    s_mul_i32 s29, s0, s11
; GFX10PLUS-NEXT:    s_mul_hi_u32 s30, s0, s11
; GFX10PLUS-NEXT:    s_cselect_b32 s28, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s18, s29, s18
; GFX10PLUS-NEXT:    s_addc_u32 s23, s30, s23
; GFX10PLUS-NEXT:    s_mul_i32 s30, s1, s10
; GFX10PLUS-NEXT:    s_mul_hi_u32 s31, s1, s10
; GFX10PLUS-NEXT:    s_cselect_b32 s29, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s18, s30, s18
; GFX10PLUS-NEXT:    s_addc_u32 s23, s31, s23
; GFX10PLUS-NEXT:    s_mul_i32 s31, s2, s9
; GFX10PLUS-NEXT:    s_mul_hi_u32 s33, s2, s9
; GFX10PLUS-NEXT:    s_cselect_b32 s30, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s18, s31, s18
; GFX10PLUS-NEXT:    s_addc_u32 s23, s33, s23
; GFX10PLUS-NEXT:    s_mul_i32 s33, s3, s8
; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s3, s8
; GFX10PLUS-NEXT:    s_cselect_b32 s31, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s18, s33, s18
; GFX10PLUS-NEXT:    s_addc_u32 s23, s34, s23
; GFX10PLUS-NEXT:    s_cselect_b32 s33, 1, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s22, 0
; GFX10PLUS-NEXT:    s_mul_hi_u32 s22, s0, s14
; GFX10PLUS-NEXT:    s_addc_u32 s18, s21, s18
; GFX10PLUS-NEXT:    s_cselect_b32 s21, 1, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s20, 0
; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s1, s13
; GFX10PLUS-NEXT:    s_addc_u32 s19, s19, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s21, 0
; GFX10PLUS-NEXT:    s_mul_i32 s21, s0, s14
; GFX10PLUS-NEXT:    s_addc_u32 s19, s19, s23
; GFX10PLUS-NEXT:    s_mul_i32 s23, s1, s13
; GFX10PLUS-NEXT:    s_cselect_b32 s20, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
; GFX10PLUS-NEXT:    s_mul_i32 s23, s2, s12
; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s2, s12
; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
; GFX10PLUS-NEXT:    s_mul_i32 s23, s3, s11
; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s3, s11
; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
; GFX10PLUS-NEXT:    s_mul_i32 s23, s4, s10
; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s4, s10
; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
; GFX10PLUS-NEXT:    s_mul_i32 s23, s5, s9
; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s5, s9
; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
; GFX10PLUS-NEXT:    s_mul_i32 s23, s6, s8
; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s6, s8
; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
; GFX10PLUS-NEXT:    s_mul_i32 s23, s0, s13
; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s0, s13
; GFX10PLUS-NEXT:    s_add_u32 s23, s23, s24
; GFX10PLUS-NEXT:    s_addc_u32 s21, s34, s21
; GFX10PLUS-NEXT:    s_mul_i32 s34, s1, s12
; GFX10PLUS-NEXT:    s_mul_hi_u32 s35, s1, s12
; GFX10PLUS-NEXT:    s_cselect_b32 s24, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s23, s34, s23
; GFX10PLUS-NEXT:    s_addc_u32 s21, s35, s21
; GFX10PLUS-NEXT:    s_mul_i32 s35, s2, s11
; GFX10PLUS-NEXT:    s_mul_hi_u32 s36, s2, s11
; GFX10PLUS-NEXT:    s_cselect_b32 s34, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s23, s35, s23
; GFX10PLUS-NEXT:    s_addc_u32 s21, s36, s21
; GFX10PLUS-NEXT:    s_mul_i32 s36, s3, s10
; GFX10PLUS-NEXT:    s_mul_hi_u32 s37, s3, s10
; GFX10PLUS-NEXT:    s_cselect_b32 s35, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s23, s36, s23
; GFX10PLUS-NEXT:    s_addc_u32 s21, s37, s21
; GFX10PLUS-NEXT:    s_mul_i32 s37, s4, s9
; GFX10PLUS-NEXT:    s_mul_hi_u32 s38, s4, s9
; GFX10PLUS-NEXT:    s_cselect_b32 s36, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s23, s37, s23
; GFX10PLUS-NEXT:    s_addc_u32 s21, s38, s21
; GFX10PLUS-NEXT:    s_mul_i32 s38, s5, s8
; GFX10PLUS-NEXT:    s_mul_hi_u32 s39, s5, s8
; GFX10PLUS-NEXT:    s_cselect_b32 s37, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s23, s38, s23
; GFX10PLUS-NEXT:    s_addc_u32 s21, s39, s21
; GFX10PLUS-NEXT:    s_cselect_b32 s38, 1, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s30, 0
; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s14
; GFX10PLUS-NEXT:    s_addc_u32 s29, s29, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s31, 0
; GFX10PLUS-NEXT:    s_mul_i32 s2, s2, s13
; GFX10PLUS-NEXT:    s_addc_u32 s29, s29, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s33, 0
; GFX10PLUS-NEXT:    s_mul_i32 s3, s3, s12
; GFX10PLUS-NEXT:    s_addc_u32 s29, s29, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s20, 0
; GFX10PLUS-NEXT:    s_mul_i32 s4, s4, s11
; GFX10PLUS-NEXT:    s_addc_u32 s20, s29, s23
; GFX10PLUS-NEXT:    s_cselect_b32 s23, 1, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s26, 0
; GFX10PLUS-NEXT:    s_mul_i32 s26, s0, s15
; GFX10PLUS-NEXT:    s_addc_u32 s25, s25, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s27, 0
; GFX10PLUS-NEXT:    s_mul_i32 s5, s5, s10
; GFX10PLUS-NEXT:    s_addc_u32 s25, s25, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s28, 0
; GFX10PLUS-NEXT:    s_mul_i32 s6, s6, s9
; GFX10PLUS-NEXT:    s_addc_u32 s25, s25, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s23, 0
; GFX10PLUS-NEXT:    s_mul_i32 s7, s7, s8
; GFX10PLUS-NEXT:    s_addc_u32 s15, s25, s21
; GFX10PLUS-NEXT:    s_addc_u32 s21, s22, s26
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s38, 0
; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s8
; GFX10PLUS-NEXT:    s_addc_u32 s1, s21, s1
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s37, 0
; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s2
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s36, 0
; GFX10PLUS-NEXT:    s_mov_b32 s2, s17
; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s3
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s35, 0
; GFX10PLUS-NEXT:    s_mov_b32 s3, s18
; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s4
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s34, 0
; GFX10PLUS-NEXT:    s_mov_b32 s4, s19
; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s5
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s24, 0
; GFX10PLUS-NEXT:    s_mov_b32 s5, s20
; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s6
; GFX10PLUS-NEXT:    s_mov_b32 s6, s15
; GFX10PLUS-NEXT:    s_add_i32 s7, s1, s7
; GFX10PLUS-NEXT:    s_mov_b32 s1, s16
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = mul i256 %num, %den
  ; Reinterpret the i256 product as <8 x i32> for the return value.
  %cast = bitcast i256 %result to <8 x i32>
  ret <8 x i32> %cast
}

; Vector-register counterpart of the 256-bit multiply test: the function has no
; explicit calling convention and no inreg attributes on its two i256 operands,
; and it returns the i256 product directly.
;
; The per-target assertions in the body are autogenerated (see the NOTE at the
; top of this file). Regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand. The first three target bodies below differ only in
; the spelling of the add-with-carry mnemonic.
define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX7-LABEL: v_mul_i256:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v16, v0
; GFX7-NEXT:    v_mov_b32_e32 v17, v1
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v14, 0
; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v16, v12, 0
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1]
; GFX7-NEXT:    v_mul_lo_u32 v27, v3, v12
; GFX7-NEXT:    v_mul_lo_u32 v26, v5, v10
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[0:1]
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[0:1]
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v11, v[18:19]
; GFX7-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
; GFX7-NEXT:    v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
; GFX7-NEXT:    v_addc_u32_e32 v20, vcc, 0, v20, vcc
; GFX7-NEXT:    v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
; GFX7-NEXT:    v_addc_u32_e32 v22, vcc, 0, v20, vcc
; GFX7-NEXT:    v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
; GFX7-NEXT:    v_addc_u32_e32 v24, vcc, 0, v22, vcc
; GFX7-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v16, v10, 0
; GFX7-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v6, v8, v[0:1]
; GFX7-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v17, v9, v[22:23]
; GFX7-NEXT:    v_mov_b32_e32 v1, v18
; GFX7-NEXT:    v_mov_b32_e32 v18, v19
; GFX7-NEXT:    v_mov_b32_e32 v19, v20
; GFX7-NEXT:    v_mad_u64_u32 v[18:19], vcc, v16, v13, v[18:19]
; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[22:23]
; GFX7-NEXT:    v_addc_u32_e64 v25, s[4:5], 0, v0, s[4:5]
; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v12, v[18:19]
; GFX7-NEXT:    v_mov_b32_e32 v0, v23
; GFX7-NEXT:    v_mul_lo_u32 v23, v4, v11
; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v16, v11, v[0:1]
; GFX7-NEXT:    v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
; GFX7-NEXT:    v_mul_lo_u32 v13, v2, v13
; GFX7-NEXT:    v_mul_lo_u32 v20, v6, v9
; GFX7-NEXT:    v_mad_u64_u32 v[11:12], s[10:11], v4, v9, v[11:12]
; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[12:13]
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v17, v10, v[0:1]
; GFX7-NEXT:    v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13]
; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[0:1]
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[14:15], v16, v8, 0
; GFX7-NEXT:    v_mov_b32_e32 v2, v22
; GFX7-NEXT:    v_addc_u32_e64 v10, s[12:13], 0, v4, s[12:13]
; GFX7-NEXT:    v_mad_u64_u32 v[1:2], s[16:17], v16, v9, v[1:2]
; GFX7-NEXT:    v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
; GFX7-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12]
; GFX7-NEXT:    v_addc_u32_e64 v10, s[12:13], 0, v10, s[12:13]
; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[16:17]
; GFX7-NEXT:    v_mad_u64_u32 v[1:2], s[12:13], v17, v8, v[1:2]
; GFX7-NEXT:    v_addc_u32_e64 v3, s[12:13], v11, v3, s[12:13]
; GFX7-NEXT:    v_mul_lo_u32 v11, v16, v15
; GFX7-NEXT:    v_mul_lo_u32 v9, v17, v14
; GFX7-NEXT:    v_addc_u32_e64 v4, s[12:13], v25, v4, s[12:13]
; GFX7-NEXT:    v_addc_u32_e64 v5, s[12:13], v10, v5, s[12:13]
; GFX7-NEXT:    v_addc_u32_e64 v6, s[12:13], v24, v6, s[12:13]
; GFX7-NEXT:    v_addc_u32_e64 v10, s[12:13], v21, v11, s[12:13]
; GFX7-NEXT:    v_addc_u32_e64 v9, s[12:13], v10, v9, s[14:15]
; GFX7-NEXT:    v_addc_u32_e64 v9, s[10:11], v9, v13, s[10:11]
; GFX7-NEXT:    v_addc_u32_e64 v9, s[8:9], v9, v27, s[8:9]
; GFX7-NEXT:    v_addc_u32_e64 v9, s[6:7], v9, v23, s[6:7]
; GFX7-NEXT:    v_addc_u32_e64 v9, s[4:5], v9, v26, s[4:5]
; GFX7-NEXT:    v_addc_u32_e32 v9, vcc, v9, v20, vcc
; GFX7-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i256:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v16, v0
; GFX8-NEXT:    v_mov_b32_e32 v17, v1
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v14, 0
; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v16, v12, 0
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1]
; GFX8-NEXT:    v_mul_lo_u32 v27, v3, v12
; GFX8-NEXT:    v_mul_lo_u32 v26, v5, v10
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[0:1]
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[0:1]
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v11, v[18:19]
; GFX8-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
; GFX8-NEXT:    v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
; GFX8-NEXT:    v_addc_u32_e32 v20, vcc, 0, v20, vcc
; GFX8-NEXT:    v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
; GFX8-NEXT:    v_addc_u32_e32 v22, vcc, 0, v20, vcc
; GFX8-NEXT:    v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
; GFX8-NEXT:    v_addc_u32_e32 v24, vcc, 0, v22, vcc
; GFX8-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v16, v10, 0
; GFX8-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v6, v8, v[0:1]
; GFX8-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v17, v9, v[22:23]
; GFX8-NEXT:    v_mov_b32_e32 v1, v18
; GFX8-NEXT:    v_mov_b32_e32 v18, v19
; GFX8-NEXT:    v_mov_b32_e32 v19, v20
; GFX8-NEXT:    v_mad_u64_u32 v[18:19], vcc, v16, v13, v[18:19]
; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[22:23]
; GFX8-NEXT:    v_addc_u32_e64 v25, s[4:5], 0, v0, s[4:5]
; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v12, v[18:19]
; GFX8-NEXT:    v_mov_b32_e32 v0, v23
; GFX8-NEXT:    v_mul_lo_u32 v23, v4, v11
; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v16, v11, v[0:1]
; GFX8-NEXT:    v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
; GFX8-NEXT:    v_mul_lo_u32 v13, v2, v13
; GFX8-NEXT:    v_mul_lo_u32 v20, v6, v9
; GFX8-NEXT:    v_mad_u64_u32 v[11:12], s[10:11], v4, v9, v[11:12]
; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[12:13]
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v17, v10, v[0:1]
; GFX8-NEXT:    v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13]
; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[0:1]
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[14:15], v16, v8, 0
; GFX8-NEXT:    v_mov_b32_e32 v2, v22
; GFX8-NEXT:    v_addc_u32_e64 v10, s[12:13], 0, v4, s[12:13]
; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[16:17], v16, v9, v[1:2]
; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
; GFX8-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12]
; GFX8-NEXT:    v_addc_u32_e64 v10, s[12:13], 0, v10, s[12:13]
; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[16:17]
; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[12:13], v17, v8, v[1:2]
; GFX8-NEXT:    v_addc_u32_e64 v3, s[12:13], v11, v3, s[12:13]
; GFX8-NEXT:    v_mul_lo_u32 v11, v16, v15
; GFX8-NEXT:    v_mul_lo_u32 v9, v17, v14
; GFX8-NEXT:    v_addc_u32_e64 v4, s[12:13], v25, v4, s[12:13]
; GFX8-NEXT:    v_addc_u32_e64 v5, s[12:13], v10, v5, s[12:13]
; GFX8-NEXT:    v_addc_u32_e64 v6, s[12:13], v24, v6, s[12:13]
; GFX8-NEXT:    v_addc_u32_e64 v10, s[12:13], v21, v11, s[12:13]
; GFX8-NEXT:    v_addc_u32_e64 v9, s[12:13], v10, v9, s[14:15]
; GFX8-NEXT:    v_addc_u32_e64 v9, s[10:11], v9, v13, s[10:11]
; GFX8-NEXT:    v_addc_u32_e64 v9, s[8:9], v9, v27, s[8:9]
; GFX8-NEXT:    v_addc_u32_e64 v9, s[6:7], v9, v23, s[6:7]
; GFX8-NEXT:    v_addc_u32_e64 v9, s[4:5], v9, v26, s[4:5]
; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v9, v20, vcc
; GFX8-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i256:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v16, v0
; GFX9-NEXT:    v_mov_b32_e32 v17, v1
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v14, 0
; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v16, v12, 0
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1]
; GFX9-NEXT:    v_mul_lo_u32 v27, v3, v12
; GFX9-NEXT:    v_mul_lo_u32 v26, v5, v10
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[0:1]
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[0:1]
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v11, v[18:19]
; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
; GFX9-NEXT:    v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
; GFX9-NEXT:    v_addc_co_u32_e32 v20, vcc, 0, v20, vcc
; GFX9-NEXT:    v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
; GFX9-NEXT:    v_addc_co_u32_e32 v22, vcc, 0, v20, vcc
; GFX9-NEXT:    v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
; GFX9-NEXT:    v_addc_co_u32_e32 v24, vcc, 0, v22, vcc
; GFX9-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v16, v10, 0
; GFX9-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v6, v8, v[0:1]
; GFX9-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v17, v9, v[22:23]
; GFX9-NEXT:    v_mov_b32_e32 v1, v18
; GFX9-NEXT:    v_mov_b32_e32 v18, v19
; GFX9-NEXT:    v_mov_b32_e32 v19, v20
; GFX9-NEXT:    v_mad_u64_u32 v[18:19], vcc, v16, v13, v[18:19]
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[22:23]
; GFX9-NEXT:    v_addc_co_u32_e64 v25, s[4:5], 0, v0, s[4:5]
; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v12, v[18:19]
; GFX9-NEXT:    v_mov_b32_e32 v0, v23
; GFX9-NEXT:    v_mul_lo_u32 v23, v4, v11
; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v16, v11, v[0:1]
; GFX9-NEXT:    v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
; GFX9-NEXT:    v_mul_lo_u32 v13, v2, v13
; GFX9-NEXT:    v_mul_lo_u32 v20, v6, v9
; GFX9-NEXT:    v_mad_u64_u32 v[11:12], s[10:11], v4, v9, v[11:12]
; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[12:13]
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v17, v10, v[0:1]
; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13]
; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[0:1]
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[14:15], v16, v8, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, v22
; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[12:13], 0, v4, s[12:13]
; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[16:17], v16, v9, v[1:2]
; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12]
; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[12:13], 0, v10, s[12:13]
; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[16:17]
; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[12:13], v17, v8, v[1:2]
; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[12:13], v11, v3, s[12:13]
; GFX9-NEXT:    v_mul_lo_u32 v11, v16, v15
; GFX9-NEXT:    v_mul_lo_u32 v9, v17, v14
; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[12:13], v25, v4, s[12:13]
; GFX9-NEXT:    v_addc_co_u32_e64 v5, s[12:13], v10, v5, s[12:13]
; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[12:13], v24, v6, s[12:13]
; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[12:13], v21, v11, s[12:13]
; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[12:13], v10, v9, s[14:15]
; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[10:11], v9, v13, s[10:11]
; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[8:9], v9, v27, s[8:9]
; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[6:7], v9, v23, s[6:7]
; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], v9, v26, s[4:5]
; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v20, vcc
; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i256:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v16, v0
; GFX10-NEXT:    v_mov_b32_e32 v17, v1
; GFX10-NEXT:    v_mul_lo_u32 v27, v6, v9
; GFX10-NEXT:    v_mul_lo_u32 v28, v5, v10
; GFX10-NEXT:    v_mul_lo_u32 v7, v7, v8
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v16, v14, 0
; GFX10-NEXT:    v_mad_u64_u32 v[18:19], s4, v16, v12, 0
; GFX10-NEXT:    v_mul_lo_u32 v30, v17, v14
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1]
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1]
; GFX10-NEXT:    v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19]
; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s4
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1]
; GFX10-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
; GFX10-NEXT:    v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
; GFX10-NEXT:    v_mad_u64_u32 v[20:21], s4, v16, v10, 0
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1]
; GFX10-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
; GFX10-NEXT:    v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1]
; GFX10-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
; GFX10-NEXT:    v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
; GFX10-NEXT:    v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1]
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21]
; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT:    v_mov_b32_e32 v20, v22
; GFX10-NEXT:    v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
; GFX10-NEXT:    v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20]
; GFX10-NEXT:    v_mov_b32_e32 v20, v18
; GFX10-NEXT:    v_mov_b32_e32 v19, v22
; GFX10-NEXT:    v_mul_lo_u32 v22, v16, v15
; GFX10-NEXT:    v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
; GFX10-NEXT:    v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20]
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, v16, v8, 0
; GFX10-NEXT:    v_mul_lo_u32 v20, v4, v11
; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s6
; GFX10-NEXT:    v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25]
; GFX10-NEXT:    v_mul_lo_u32 v25, v3, v12
; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15]
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
; GFX10-NEXT:    v_mul_lo_u32 v24, v2, v13
; GFX10-NEXT:    v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19]
; GFX10-NEXT:    v_mov_b32_e32 v13, v1
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12]
; GFX10-NEXT:    v_mov_b32_e32 v14, v21
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
; GFX10-NEXT:    v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19]
; GFX10-NEXT:    v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14]
; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s8
; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2]
; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s8, 0, v6, s8
; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11]
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13]
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s9, v9, v3, s9
; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s9, v29, v4, s9
; GFX10-NEXT:    v_add_co_ci_u32_e64 v5, s9, v14, v5, s9
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s9, v26, v6, s9
; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s9, v23, v22, s9
; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s8, v9, v30, s8
; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s6, v9, v24, s6
; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s6, v9, v25, s7
; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s5, v9, v20, s5
; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s4
; GFX10-NEXT:    v_add_nc_u32_e32 v7, v8, v7
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i256:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
; GFX11-NEXT:    v_mul_lo_u32 v7, v7, v8
; GFX11-NEXT:    v_mul_lo_u32 v27, v6, v9
; GFX11-NEXT:    v_mul_lo_u32 v28, v5, v10
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v16, v14, 0
; GFX11-NEXT:    v_mad_u64_u32 v[18:19], null, v16, v12, 0
; GFX11-NEXT:    v_mul_lo_u32 v30, v17, v14
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1]
; GFX11-NEXT:    v_mad_u64_u32 v[18:19], s0, v17, v11, v[18:19]
; GFX11-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s0
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1]
; GFX11-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
; GFX11-NEXT:    v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
; GFX11-NEXT:    v_mad_u64_u32 v[20:21], null, v16, v10, 0
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1]
; GFX11-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
; GFX11-NEXT:    v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1]
; GFX11-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
; GFX11-NEXT:    v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1]
; GFX11-NEXT:    v_mad_u64_u32 v[22:23], null, v6, v8, v[0:1]
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21]
; GFX11-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s0
; GFX11-NEXT:    v_mov_b32_e32 v20, v22
; GFX11-NEXT:    v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
; GFX11-NEXT:    v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v16, v13, v[19:20]
; GFX11-NEXT:    v_mov_b32_e32 v20, v18
; GFX11-NEXT:    v_mov_b32_e32 v19, v22
; GFX11-NEXT:    v_mul_lo_u32 v22, v16, v15
; GFX11-NEXT:    v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
; GFX11-NEXT:    v_mad_u64_u32 v[14:15], s2, v16, v11, v[19:20]
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v16, v8, 0
; GFX11-NEXT:    v_mul_lo_u32 v20, v4, v11
; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s2
; GFX11-NEXT:    v_mad_u64_u32 v[18:19], s1, v2, v11, v[24:25]
; GFX11-NEXT:    v_mul_lo_u32 v25, v3, v12
; GFX11-NEXT:    v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15]
; GFX11-NEXT:    v_mov_b32_e32 v14, v21
; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
; GFX11-NEXT:    v_mad_u64_u32 v[18:19], s3, v3, v10, v[18:19]
; GFX11-NEXT:    v_mul_lo_u32 v24, v2, v13
; GFX11-NEXT:    v_mov_b32_e32 v13, v1
; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s2, v2, v9, v[11:12]
; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
; GFX11-NEXT:    v_mad_u64_u32 v[10:11], s2, v4, v9, v[18:19]
; GFX11-NEXT:    v_mad_u64_u32 v[12:13], s4, v16, v9, v[13:14]
; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s4
; GFX11-NEXT:    v_mad_u64_u32 v[3:4], s4, v3, v8, v[1:2]
; GFX11-NEXT:    v_add_co_ci_u32_e64 v14, s4, 0, v6, s4
; GFX11-NEXT:    v_mad_u64_u32 v[5:6], s4, v5, v8, v[10:11]
; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s5, v17, v8, v[12:13]
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
; GFX11-NEXT:    v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, s5, v14, v5, s5
; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, s5, v26, v6, s5
; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s5, v23, v22, s5
; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s4, v9, v30, s4
; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s2, v9, v24, s2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s2, v9, v25, s3
; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s1, v9, v20, s1
; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0
; GFX11-NEXT:    v_add_nc_u32_e32 v7, v8, v7
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  ; The only IR under test: one i256 multiply whose result is returned unchanged.
  %result = mul i256 %num, %den
  ret i256 %result
}