#1 Profiling generated asm
Opened by Kerollmops on Jan 28, 2021, 6:57 PM
// Kerollmops on Jan 28, 2021, 6:57 PM
Without the `-C target-cpu=native` flag
35.063ms 96.767ms
cargo asm -- simdpage::avx_search
; simdpage::avx_search -- listing produced without `-C target-cpu=native`.
; Compiler-generated code, Intel operand order (destination first).
; In:  rdi = base address of the 8-byte elements that are probed,
;      rsi = 64-bit value that is searched for.
; Out: rax = 1 when an equal element was found, 0 otherwise;
;      rdx = copy of r12 (the element index) at exit.
; In this build every AVX/AVX2 intrinsic is reached through an out-of-line
; `call`, with its 256-bit operands and result passed through stack slots.
simdpage::avx_search:
; Prologue: frame pointer, five callee-saved registers, 32-byte aligned
; stack with 416 bytes of locals.
push rbp
mov rbp, rsp
push r15
push r14
push r13
push r12
push rbx
and rsp, -32
sub rsp, 416
; Spill the base pointer, then write the searched value into four
; consecutive qwords at [rsp+96..127] (one copy per 64-bit lane).
mov qword, ptr, [rsp, +, 24], rdi
mov qword, ptr, [rsp, +, 96], rsi
mov qword, ptr, [rsp, +, 104], rsi
mov qword, ptr, [rsp, +, 112], rsi
mov qword, ptr, [rsp, +, 120], rsi
; Copy 32 bytes of constants (LCPI21_0, LCPI21_1) to [rsp+288..319]; the
; loop reads them back as a table of qwords. Their values are not part of
; this listing.
movaps xmm0, xmmword, ptr, [rip, +, LCPI21_0]
movaps xmmword, ptr, [rsp, +, 288], xmm0
movaps xmm0, xmmword, ptr, [rip, +, LCPI21_1]
movaps xmmword, ptr, [rsp, +, 304], xmm0
; Runtime CPU-feature check: read std_detect's cache word; a zero value
; takes the detect_and_initialize path at LBB21_1.
mov rax, qword, ptr, [rip, +, __ZN3std10std_detect6detect5cache5CACHE17hbe0e9d1b17ddb275E@GOTPCREL]
mov rax, qword, ptr, [rax]
test rax, rax
je LBB21_1
; Bit 14 (16384) must be set, otherwise panic at LBB21_10.
; NOTE(review): presumably the feature bit required by this function --
; confirm against std_detect.
test eax, 16384
je LBB21_10
LBB21_3:
; Load the 32-byte value vector from [rsp+96] into the result slot
; [rsp+32], then keep a copy of it at [rsp+64..95].
lea rdi, [rsp, +, 32]
lea rsi, [rsp, +, 96]
call core::core_arch::x86::avx::_mm256_loadu_si256
movaps xmm0, xmmword, ptr, [rsp, +, 32]
movaps xmmword, ptr, [rsp, +, 80], xmm0
movaps xmm0, xmmword, ptr, [rsp, +, 48]
movaps xmmword, ptr, [rsp, +, 64], xmm0
; Loop state: r13 = byte offset into the constant table (starts at 8),
; r12 = current element index, r15 = accumulator carried between
; iterations, rbx / r14 = result slots for the two compares.
mov r13d, 8
xor r12d, r12d
lea rbx, [rsp, +, 320]
lea r14, [rsp, +, 352]
xor r15d, r15d
LBB21_4:
; Loop body: load 32 bytes from base + 8*r12 into [rsp+32].
mov rax, qword, ptr, [rsp, +, 24]
lea rsi, [rax, +, 8*r12]
lea rdi, [rsp, +, 32]
call core::core_arch::x86::avx::_mm256_loadu_si256
; Stage the operands on the stack: value vector at [rsp+160], loaded
; elements at [rsp+192]; a second copy of the elements is kept at
; [rsp+128..159] for the greater-than compare below.
movaps xmm1, xmmword, ptr, [rsp, +, 32]
movaps xmm2, xmmword, ptr, [rsp, +, 48]
movaps xmm0, xmmword, ptr, [rsp, +, 64]
movaps xmmword, ptr, [rsp, +, 176], xmm0
movaps xmm0, xmmword, ptr, [rsp, +, 80]
movaps xmmword, ptr, [rsp, +, 160], xmm0
movaps xmmword, ptr, [rsp, +, 128], xmm2
movaps xmmword, ptr, [rsp, +, 208], xmm2
movaps xmmword, ptr, [rsp, +, 144], xmm1
movaps xmmword, ptr, [rsp, +, 192], xmm1
; Equality compare, result written to the slot in rbx, then reduced to a
; 32-bit byte mask in eax.
mov rdi, rbx
lea rsi, [rsp, +, 160]
lea rdx, [rsp, +, 192]
call core::core_arch::x86::avx2::_mm256_cmpeq_epi64
mov rdi, rbx
call core::core_arch::x86::avx2::_mm256_movemask_epi8
; Dispatch on the mask. 65279 (0xFEFF) only splits the case range.
; 0x000000FF -> match at r12, 0x0000FF00 -> r12+1, 0x00FF0000 -> r12+2,
; 0xFF000000 (-16777216) -> r12+3; any other mask continues at LBB21_7.
cmp eax, 65279
jg LBB21_12
cmp eax, -16777216
je LBB21_19
cmp eax, 255
jne LBB21_7
jmp LBB21_20
LBB21_12:
cmp eax, 16711680
je LBB21_15
cmp eax, 65280
je LBB21_14
LBB21_7:
; No single-lane match: compare value vector (first operand, [rsp+224])
; against the loaded elements (second operand, [rsp+256]) with the signed
; greater-than intrinsic and reduce the result to a byte mask.
movaps xmm0, xmmword, ptr, [rsp, +, 80]
movaps xmmword, ptr, [rsp, +, 224], xmm0
movaps xmm0, xmmword, ptr, [rsp, +, 64]
movaps xmmword, ptr, [rsp, +, 240], xmm0
movaps xmm0, xmmword, ptr, [rsp, +, 144]
movaps xmmword, ptr, [rsp, +, 256], xmm0
movaps xmm0, xmmword, ptr, [rsp, +, 128]
movaps xmmword, ptr, [rsp, +, 272], xmm0
mov rdi, r14
lea rsi, [rsp, +, 224]
lea rdx, [rsp, +, 256]
call core::core_arch::x86::avx2::_mm256_cmpgt_epi64
mov rdi, r14
call core::core_arch::x86::avx2::_mm256_movemask_epi8
; Map the mask to a step value in rax: 0 -> 0, 0xFF -> 1, 0xFFFF -> 2,
; 0xFFFFFF -> 3, anything else -> 4. Every arm then tests r13 == 32,
; which ends the search as "not found" (LBB21_11).
cmp eax, 65534
jg LBB21_16
test eax, eax
je LBB21_9
cmp eax, 255
jne LBB21_25
mov eax, 1
cmp r13, 32
jne LBB21_27
jmp LBB21_11
LBB21_16:
cmp eax, 65535
je LBB21_23
cmp eax, 16777215
jne LBB21_25
mov eax, 3
cmp r13, 32
jne LBB21_27
jmp LBB21_11
LBB21_25:
mov eax, 4
cmp r13, 32
jne LBB21_27
jmp LBB21_11
LBB21_9:
xor eax, eax
cmp r13, 32
jne LBB21_27
jmp LBB21_11
LBB21_23:
mov eax, 2
cmp r13, 32
je LBB21_11
LBB21_27:
; Advance: rax = step + 5*r15; r12 = 4*rax + qword at [rsp+288+r13];
; r13 moves to the next table entry and r15 takes the new rax.
lea rcx, [r15, +, 4*r15]
add rax, rcx
lea r12, [4*rax]
add r12, qword, ptr, [rsp, +, r13, +, 288]
add r13, 8
mov r15, rax
; Bounds check: keep looping while r12 is below 625 (unsigned), otherwise
; call panic_bounds_check with index r12 and length 625.
cmp r12, 625
jb LBB21_4
lea rdx, [rip, +, l___unnamed_26]
mov esi, 625
mov rdi, r12
call core::panicking::panic_bounds_check
LBB21_11:
; Not found: rax = 0.
xor eax, eax
jmp LBB21_21
; Found: adjust r12 by the matching lane number, then rax = 1.
LBB21_19:
add r12, 3
jmp LBB21_20
LBB21_15:
add r12, 2
jmp LBB21_20
LBB21_14:
inc r12
LBB21_20:
mov eax, 1
LBB21_21:
; Common exit: rdx = r12; rsp is reset to just below the five saved
; registers (rbp - 40) before they are popped.
mov rdx, r12
lea rsp, [rbp, -, 40]
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
LBB21_1:
; Feature cache was zero: initialise it, then repeat the bit-14 test.
call std::std_detect::detect::cache::detect_and_initialize
test eax, 16384
jne LBB21_3
LBB21_10:
; Required feature bit missing: call begin_panic with rdi = l___unnamed_27,
; esi = 49 and rdx = l___unnamed_28 (presumably message pointer, message
; length and location data -- the referenced data is not in this listing).
lea rdi, [rip, +, l___unnamed_27]
lea rdx, [rip, +, l___unnamed_28]
mov esi, 49
call std::panicking::begin_panic
With the `-C target-cpu=native` flag
34.142ms 36.923ms
RUSTFLAGS='-C target-cpu=native' cargo asm -- simdpage::avx_search
; simdpage::avx_search -- listing produced with
; RUSTFLAGS='-C target-cpu=native'.
; Compiler-generated code, Intel operand order (destination first).
; In:  rdi = base address of the 8-byte elements that are probed,
;      rsi = 64-bit value that is searched for.
; Out: rax = 1 when an equal element was found, 0 otherwise;
;      rdx = element index computed for the last probe.
; The vector work appears inline as v-prefixed instructions on ymm
; registers; the four probes are laid out straight-line with no loop and
; no bounds-check call.
simdpage::avx_search:
; Prologue: frame pointer plus two callee-saved registers that hold the
; arguments (rbx = searched value, r14 = base address).
push rbp
mov rbp, rsp
push r14
push rbx
mov rbx, rsi
mov r14, rdi
; Runtime CPU-feature check: read std_detect's cache word; a zero value
; takes the detect_and_initialize path at LBB17_1.
mov rax, qword, ptr, [rip, +, __ZN3std10std_detect6detect5cache5CACHE17hbe0e9d1b17ddb275E@GOTPCREL]
mov rax, qword, ptr, [rax]
test rax, rax
je LBB17_1
; Bit 14 (16384) must be set, otherwise panic at LBB17_58.
; NOTE(review): presumably the feature bit required by this function --
; confirm against std_detect.
test eax, 16384
je LBB17_58
LBB17_3:
; ymm0 = searched value broadcast to all four 64-bit lanes; it stays live
; for every probe below.
vmovq xmm0, rbx
vpbroadcastq ymm0, xmm0
; Probe 1: elements 0..3. Equality compare reduced to a byte mask in ecx.
vmovdqu ymm1, ymmword, ptr, [r14]
vpcmpeqq ymm2, ymm0, ymm1
vpmovmskb ecx, ymm2
; Preset the "found" result: rax = 1, rdx = index of the first element of
; this probe (0).
mov eax, 1
xor edx, edx
; Dispatch on the mask. 65279 (0xFEFF) only splits the case range.
; 0x000000FF -> lane 0, 0x0000FF00 -> lane 1, 0x00FF0000 -> lane 2,
; 0xFF000000 (-16777216) -> lane 3; any other mask continues at LBB17_6.
cmp ecx, 65279
jg LBB17_9
cmp ecx, -16777216
je LBB17_23
cmp ecx, 255
jne LBB17_6
LBB17_19:
; Shared exit, match in lane 0: rdx already holds the index.
pop rbx
pop r14
pop rbp
vzeroupper
ret
LBB17_9:
cmp ecx, 16711680
je LBB17_18
cmp ecx, 65280
jne LBB17_6
LBB17_11:
; Shared exit, match in lane 1: index + 1.
inc rdx
pop rbx
pop r14
pop rbp
vzeroupper
ret
LBB17_6:
; Probe 1 had no single-lane match: signed greater-than compare of the
; searched value against the loaded elements, reduced to a byte mask.
; Mask -> step in rcx: 0 -> 0, 0xFF -> 1, 0xFFFF -> 2, 0xFFFFFF -> 3,
; anything else -> 4.
vpcmpgtq ymm1, ymm0, ymm1
vpmovmskb ecx, ymm1
cmp ecx, 65534
jg LBB17_12
test ecx, ecx
je LBB17_8
cmp ecx, 255
jne LBB17_17
mov ecx, 1
jmp LBB17_21
LBB17_12:
cmp ecx, 65535
je LBB17_15
cmp ecx, 16777215
jne LBB17_17
mov ecx, 3
jmp LBB17_21
LBB17_1:
; Feature cache was zero: initialise it, then repeat the bit-14 test.
call std::std_detect::detect::cache::detect_and_initialize
test eax, 16384
jne LBB17_3
LBB17_58:
; Required feature bit missing: call begin_panic with rdi = l___unnamed_26,
; esi = 49 and rdx = l___unnamed_27 (presumably message pointer, message
; length and location data -- the referenced data is not in this listing).
lea rdi, [rip, +, l___unnamed_26]
lea rdx, [rip, +, l___unnamed_27]
mov esi, 49
call std::panicking::begin_panic
LBB17_8:
xor ecx, ecx
jmp LBB17_21
LBB17_15:
mov ecx, 2
jmp LBB17_21
LBB17_17:
mov ecx, 4
LBB17_21:
; Probe 2: rcx = step from probe 1. Elements start at index 4*rcx + 4
; (byte offset 8*(4*rcx) + 32); rdx holds that index for the exits.
lea rsi, [4*rcx]
lea rdx, [4*rcx, +, 4]
vmovdqu ymm1, ymmword, ptr, [r14, +, 8*rsi, +, 32]
vpcmpeqq ymm2, ymm0, ymm1
vpmovmskb esi, ymm2
; Same mask dispatch as probe 1, on esi.
cmp esi, 65279
jg LBB17_25
cmp esi, -16777216
je LBB17_23
cmp esi, 255
je LBB17_19
jmp LBB17_27
LBB17_25:
cmp esi, 65280
je LBB17_11
cmp esi, 16711680
jne LBB17_27
LBB17_18:
; Shared exit, match in lane 2: index + 2.
add rdx, 2
pop rbx
pop r14
pop rbp
vzeroupper
ret
LBB17_27:
; Probe 2 had no single-lane match: greater-than mask in esi, and
; rdx = 5 * (step from probe 1). The new step goes into rcx using the same
; mask mapping as before.
vpcmpgtq ymm1, ymm0, ymm1
vpmovmskb esi, ymm1
lea rdx, [rcx, +, 4*rcx]
cmp esi, 65534
jg LBB17_30
test esi, esi
je LBB17_29
cmp esi, 255
jne LBB17_36
mov ecx, 1
jmp LBB17_37
LBB17_30:
cmp esi, 65535
je LBB17_33
cmp esi, 16777215
jne LBB17_36
mov ecx, 3
jmp LBB17_37
LBB17_29:
xor ecx, ecx
jmp LBB17_37
LBB17_33:
mov ecx, 2
jmp LBB17_37
LBB17_36:
mov ecx, 4
LBB17_37:
; Probe 3: rcx = new step + 5 * previous step. Elements start at index
; 4*rcx + 24 (byte offset 8*(4*rcx) + 192).
add rcx, rdx
lea rsi, [4*rcx]
lea rdx, [4*rcx, +, 24]
vmovdqu ymm1, ymmword, ptr, [r14, +, 8*rsi, +, 192]
vpcmpeqq ymm2, ymm0, ymm1
vpmovmskb esi, ymm2
; Same mask dispatch as probe 1, on esi.
cmp esi, 65279
jg LBB17_40
cmp esi, -16777216
je LBB17_23
cmp esi, 255
je LBB17_19
jmp LBB17_42
LBB17_40:
cmp esi, 65280
je LBB17_11
cmp esi, 16711680
je LBB17_18
LBB17_42:
; Probe 3 had no single-lane match: greater-than mask in edx, and
; rcx = 5 * rcx. The new step goes into rsi using the same mask mapping.
vpcmpgtq ymm1, ymm0, ymm1
vpmovmskb edx, ymm1
lea rcx, [rcx, +, 4*rcx]
cmp edx, 65534
jg LBB17_45
test edx, edx
je LBB17_44
cmp edx, 255
jne LBB17_51
mov esi, 1
jmp LBB17_52
LBB17_45:
cmp edx, 65535
je LBB17_48
cmp edx, 16777215
jne LBB17_51
mov esi, 3
jmp LBB17_52
LBB17_44:
xor esi, esi
jmp LBB17_52
LBB17_48:
mov esi, 2
jmp LBB17_52
LBB17_51:
mov esi, 4
LBB17_52:
; Probe 4 (last): rsi = new step + rcx. Elements start at index
; 4*rsi + 124 (byte offset 8*(4*rsi) + 992); only the equality compare is
; done here, with the load folded into vpcmpeqq.
add rsi, rcx
lea rdx, [4*rsi, +, 124]
shl rsi, 2
vpcmpeqq ymm0, ymm0, ymmword, ptr, [r14, +, 8*rsi, +, 992]
vpmovmskb ecx, ymm0
; Same mask dispatch on ecx; a mask that matches no single lane falls
; through to the "not found" exit at LBB17_57.
cmp ecx, 65279
jg LBB17_55
cmp ecx, -16777216
je LBB17_23
cmp ecx, 255
je LBB17_19
jmp LBB17_57
LBB17_23:
; Shared exit, match in lane 3: index + 3.
add rdx, 3
pop rbx
pop r14
pop rbp
vzeroupper
ret
LBB17_55:
cmp ecx, 65280
je LBB17_11
cmp ecx, 16711680
je LBB17_18
LBB17_57:
; Not found: rax = 0 (rdx is left at the last probe's index).
xor eax, eax
pop rbx
pop r14
pop rbp
vzeroupper
ret
IPLJS3L5XIUMWSJVWMUISC66OEHIZZMB6RTE2K4TJXR2HL55MLPAC