35.063ms 96.767ms
cargo asm -- simdpage::avx_search
# ----------------------------------------------------------------------
# simdpage::avx_search -- `cargo asm` listing, default target CPU.
# Syntax: LLVM/GAS Intel syntax, x86-64; symbol names are demangled by
# cargo-asm, so this is a listing for reading, not for re-assembling.
#
# In:   rdi = base pointer of the searched u64 array (saved at [rsp + 24])
#       rsi = 64-bit key
# Out:  rax = 1 if a lane compared equal to the key, 0 otherwise
#       rdx = element index held in r12 when the function returns
#
# The AVX/AVX2 intrinsics are NOT inlined in this build: every load,
# compare and movemask is an out-of-line call that passes its 256-bit
# operands through stack slots, hence the many movaps copies.
#
# Movemask values tested below (4 lanes x 8 mask bytes each):
#   equality mask : 255 / 65280 / 16711680 / -16777216
#                   = 0xFF / 0xFF00 / 0xFF0000 / 0xFF000000 -> lane 0/1/2/3
#   key > element : 0 / 255 / 65535 / 16777215 -> 0/1/2/3 lanes set,
#                   any other value is treated as 4
# ----------------------------------------------------------------------
simdpage::avx_search:
        push rbp
        mov rbp, rsp
        push r15
        push r14
        push r13
        push r12
        push rbx
        and rsp, -32                    # 32-byte align the frame for the 256-bit temporaries
        sub rsp, 416
        mov qword ptr [rsp + 24], rdi   # keep the array base pointer across calls
        mov qword ptr [rsp + 96], rsi   # four copies of the key = one 256-bit vector
        mov qword ptr [rsp + 104], rsi
        mov qword ptr [rsp + 112], rsi
        mov qword ptr [rsp + 120], rsi
        movaps xmm0, xmmword ptr [rip + LCPI21_0]
        movaps xmmword ptr [rsp + 288], xmm0    # 32-byte constant table copied to [rsp + 288]
        movaps xmm0, xmmword ptr [rip + LCPI21_1]
        movaps xmmword ptr [rsp + 304], xmm0    # (table values are not shown in this listing)
        mov rax, qword ptr [rip + __ZN3std10std_detect6detect5cache5CACHE17hbe0e9d1b17ddb275E@GOTPCREL]
        mov rax, qword ptr [rax]
        test rax, rax                   # cache word still zero -> run detection first
        je LBB21_1
        test eax, 16384                 # feature bit 0x4000 -- presumably AVX2; confirm in std_detect
        je LBB21_10
LBB21_3:
        lea rdi, [rsp + 32]
        lea rsi, [rsp + 96]
        call core::core_arch::x86::avx::_mm256_loadu_si256
        movaps xmm0, xmmword ptr [rsp + 32]
        movaps xmmword ptr [rsp + 80], xmm0     # key vector kept in [rsp + 64] / [rsp + 80]
        movaps xmm0, xmmword ptr [rsp + 48]
        movaps xmmword ptr [rsp + 64], xmm0
        mov r13d, 8                     # r13 = byte offset into the table at [rsp + 288]
        xor r12d, r12d                  # r12 = index of the first element of the current group
        lea rbx, [rsp + 320]            # rbx = result slot for the equality compare
        lea r14, [rsp + 352]            # r14 = result slot for the greater-than compare
        xor r15d, r15d                  # r15 = accumulated position from previous iterations
LBB21_4:
        mov rax, qword ptr [rsp + 24]
        lea rsi, [rax + 8*r12]          # address of 4 consecutive u64 starting at index r12
        lea rdi, [rsp + 32]
        call core::core_arch::x86::avx::_mm256_loadu_si256
        movaps xmm1, xmmword ptr [rsp + 32]
        movaps xmm2, xmmword ptr [rsp + 48]
        movaps xmm0, xmmword ptr [rsp + 64]
        movaps xmmword ptr [rsp + 176], xmm0
        movaps xmm0, xmmword ptr [rsp + 80]
        movaps xmmword ptr [rsp + 160], xmm0
        movaps xmmword ptr [rsp + 128], xmm2
        movaps xmmword ptr [rsp + 208], xmm2
        movaps xmmword ptr [rsp + 144], xmm1
        movaps xmmword ptr [rsp + 192], xmm1
        mov rdi, rbx
        lea rsi, [rsp + 160]            # operand a = key vector
        lea rdx, [rsp + 192]            # operand b = loaded elements
        call core::core_arch::x86::avx2::_mm256_cmpeq_epi64
        mov rdi, rbx
        call core::core_arch::x86::avx2::_mm256_movemask_epi8
        cmp eax, 65279                  # split the four single-lane masks into two ranges
        jg LBB21_12
        cmp eax, -16777216              # 0xFF000000: lane 3 equal
        je LBB21_19
        cmp eax, 255                    # 0x000000FF: lane 0 equal
        jne LBB21_7
        jmp LBB21_20
LBB21_12:
        cmp eax, 16711680               # 0x00FF0000: lane 2 equal
        je LBB21_15
        cmp eax, 65280                  # 0x0000FF00: lane 1 equal
        je LBB21_14
LBB21_7:
        # No single-lane match: count the lanes where key > element.
        movaps xmm0, xmmword ptr [rsp + 80]
        movaps xmmword ptr [rsp + 224], xmm0
        movaps xmm0, xmmword ptr [rsp + 64]
        movaps xmmword ptr [rsp + 240], xmm0
        movaps xmm0, xmmword ptr [rsp + 144]
        movaps xmmword ptr [rsp + 256], xmm0
        movaps xmm0, xmmword ptr [rsp + 128]
        movaps xmmword ptr [rsp + 272], xmm0
        mov rdi, r14
        lea rsi, [rsp + 224]            # operand a = key vector
        lea rdx, [rsp + 256]            # operand b = loaded elements
        call core::core_arch::x86::avx2::_mm256_cmpgt_epi64
        mov rdi, r14
        call core::core_arch::x86::avx2::_mm256_movemask_epi8
        cmp eax, 65534
        jg LBB21_16
        test eax, eax
        je LBB21_9
        cmp eax, 255
        jne LBB21_25
        mov eax, 1
        cmp r13, 32                     # table exhausted -> not found
        jne LBB21_27
        jmp LBB21_11
LBB21_16:
        cmp eax, 65535
        je LBB21_23
        cmp eax, 16777215
        jne LBB21_25
        mov eax, 3
        cmp r13, 32
        jne LBB21_27
        jmp LBB21_11
LBB21_25:
        mov eax, 4
        cmp r13, 32
        jne LBB21_27
        jmp LBB21_11
LBB21_9:
        xor eax, eax
        cmp r13, 32
        jne LBB21_27
        jmp LBB21_11
LBB21_23:
        mov eax, 2
        cmp r13, 32
        je LBB21_11
LBB21_27:
        lea rcx, [r15 + 4*r15]          # rcx = 5 * previous position
        add rax, rcx                    # rax = lane count + 5 * previous position
        lea r12, [4*rax]
        add r12, qword ptr [rsp + r13 + 288]    # next index = 4 * position + table entry
        add r13, 8
        mov r15, rax
        cmp r12, 625                    # bounds check against 625 elements
        jb LBB21_4
        lea rdx, [rip + l___unnamed_26]
        mov esi, 625
        mov rdi, r12
        call core::panicking::panic_bounds_check
LBB21_11:
        xor eax, eax                    # not found
        jmp LBB21_21
LBB21_19:
        add r12, 3                      # match in lane 3
        jmp LBB21_20
LBB21_15:
        add r12, 2                      # match in lane 2
        jmp LBB21_20
LBB21_14:
        inc r12                         # match in lane 1
LBB21_20:
        mov eax, 1                      # found
LBB21_21:
        mov rdx, r12                    # second return register = element index
        lea rsp, [rbp - 40]             # drop the aligned frame; 5 callee-saved pushes below rbp
        pop rbx
        pop r12
        pop r13
        pop r14
        pop r15
        pop rbp
        ret
LBB21_1:
        call std::std_detect::detect::cache::detect_and_initialize
        test eax, 16384
        jne LBB21_3
LBB21_10:
        # Required CPU feature missing: panic (esi = 49, presumably the message length).
        lea rdi, [rip + l___unnamed_27]
        lea rdx, [rip + l___unnamed_28]
        mov esi, 49
        call std::panicking::begin_panic
34.142ms 36.923ms
RUSTFLAGS='-C target-cpu=native' cargo asm -- simdpage::avx_search
# ----------------------------------------------------------------------
# simdpage::avx_search -- `cargo asm` listing built with
# RUSTFLAGS='-C target-cpu=native'.
# Syntax: LLVM/GAS Intel syntax, x86-64; symbol names are demangled by
# cargo-asm, so this is a listing for reading, not for re-assembling.
#
# In:   rdi = base pointer of the searched u64 array (kept in r14)
#       rsi = 64-bit key (kept in rbx, broadcast into ymm0)
# Out:  rax = 1 if a lane compared equal to the key, 0 otherwise
#       rdx = element index computed for the last group examined
#
# In this build the vector operations appear as inline AVX2 instructions
# and the search is fully unrolled into four compare stages, reading
# groups at byte offsets +0, +32, +192 and +992 from the computed base.
#
# vpmovmskb values tested below (4 lanes x 8 mask bytes each):
#   equality mask : 255 / 65280 / 16711680 / -16777216
#                   = 0xFF / 0xFF00 / 0xFF0000 / 0xFF000000 -> lane 0/1/2/3
#   key > element : 0 / 255 / 65535 / 16777215 -> 0/1/2/3 lanes set,
#                   any other value is treated as 4
# ----------------------------------------------------------------------
simdpage::avx_search:
        push rbp
        mov rbp, rsp
        push r14
        push rbx
        mov rbx, rsi                    # rbx = key
        mov r14, rdi                    # r14 = array base
        mov rax, qword ptr [rip + __ZN3std10std_detect6detect5cache5CACHE17hbe0e9d1b17ddb275E@GOTPCREL]
        mov rax, qword ptr [rax]
        test rax, rax                   # cache word still zero -> run detection first
        je LBB17_1
        test eax, 16384                 # feature bit 0x4000 -- presumably AVX2; confirm in std_detect
        je LBB17_58
LBB17_3:
        # ---- stage 1: elements 0..3 ----
        vmovq xmm0, rbx
        vpbroadcastq ymm0, xmm0         # ymm0 = key in all four 64-bit lanes
        vmovdqu ymm1, ymmword ptr [r14]
        vpcmpeqq ymm2, ymm0, ymm1
        vpmovmskb ecx, ymm2
        mov eax, 1                      # assume "found"; cleared only at LBB17_57
        xor edx, edx                    # index of lane 0 of this group
        cmp ecx, 65279
        jg LBB17_9
        cmp ecx, -16777216              # lane 3 equal
        je LBB17_23
        cmp ecx, 255                    # lane 0 equal
        jne LBB17_6
LBB17_19:
        # Shared exit: match in lane 0, rdx already holds the index.
        pop rbx
        pop r14
        pop rbp
        vzeroupper
        ret
LBB17_9:
        cmp ecx, 16711680               # lane 2 equal
        je LBB17_18
        cmp ecx, 65280                  # lane 1 equal
        jne LBB17_6
LBB17_11:
        # Shared exit: match in lane 1.
        inc rdx
        pop rbx
        pop r14
        pop rbp
        vzeroupper
        ret
LBB17_6:
        vpcmpgtq ymm1, ymm0, ymm1       # lanes where key > element
        vpmovmskb ecx, ymm1
        cmp ecx, 65534
        jg LBB17_12
        test ecx, ecx
        je LBB17_8
        cmp ecx, 255
        jne LBB17_17
        mov ecx, 1
        jmp LBB17_21
LBB17_12:
        cmp ecx, 65535
        je LBB17_15
        cmp ecx, 16777215
        jne LBB17_17
        mov ecx, 3
        jmp LBB17_21
LBB17_1:
        call std::std_detect::detect::cache::detect_and_initialize
        test eax, 16384
        jne LBB17_3
LBB17_58:
        # Required CPU feature missing: panic (esi = 49, presumably the message length).
        lea rdi, [rip + l___unnamed_26]
        lea rdx, [rip + l___unnamed_27]
        mov esi, 49
        call std::panicking::begin_panic
LBB17_8:
        xor ecx, ecx
        jmp LBB17_21
LBB17_15:
        mov ecx, 2
        jmp LBB17_21
LBB17_17:
        mov ecx, 4
LBB17_21:
        # ---- stage 2: rcx = position chosen in stage 1 (0..4) ----
        lea rsi, [4*rcx]
        lea rdx, [4*rcx + 4]            # element index of lane 0 of this group
        vmovdqu ymm1, ymmword ptr [r14 + 8*rsi + 32]
        vpcmpeqq ymm2, ymm0, ymm1
        vpmovmskb esi, ymm2
        cmp esi, 65279
        jg LBB17_25
        cmp esi, -16777216
        je LBB17_23
        cmp esi, 255
        je LBB17_19
        jmp LBB17_27
LBB17_25:
        cmp esi, 65280
        je LBB17_11
        cmp esi, 16711680
        jne LBB17_27
LBB17_18:
        # Shared exit: match in lane 2.
        add rdx, 2
        pop rbx
        pop r14
        pop rbp
        vzeroupper
        ret
LBB17_27:
        vpcmpgtq ymm1, ymm0, ymm1
        vpmovmskb esi, ymm1
        lea rdx, [rcx + 4*rcx]          # rdx = 5 * stage-1 position
        cmp esi, 65534
        jg LBB17_30
        test esi, esi
        je LBB17_29
        cmp esi, 255
        jne LBB17_36
        mov ecx, 1
        jmp LBB17_37
LBB17_30:
        cmp esi, 65535
        je LBB17_33
        cmp esi, 16777215
        jne LBB17_36
        mov ecx, 3
        jmp LBB17_37
LBB17_29:
        xor ecx, ecx
        jmp LBB17_37
LBB17_33:
        mov ecx, 2
        jmp LBB17_37
LBB17_36:
        mov ecx, 4
LBB17_37:
        # ---- stage 3: rcx = stage-2 lane count + 5 * stage-1 position ----
        add rcx, rdx
        lea rsi, [4*rcx]
        lea rdx, [4*rcx + 24]           # element index of lane 0 of this group
        vmovdqu ymm1, ymmword ptr [r14 + 8*rsi + 192]
        vpcmpeqq ymm2, ymm0, ymm1
        vpmovmskb esi, ymm2
        cmp esi, 65279
        jg LBB17_40
        cmp esi, -16777216
        je LBB17_23
        cmp esi, 255
        je LBB17_19
        jmp LBB17_42
LBB17_40:
        cmp esi, 65280
        je LBB17_11
        cmp esi, 16711680
        je LBB17_18
LBB17_42:
        vpcmpgtq ymm1, ymm0, ymm1
        vpmovmskb edx, ymm1
        lea rcx, [rcx + 4*rcx]          # rcx = 5 * stage-3 position
        cmp edx, 65534
        jg LBB17_45
        test edx, edx
        je LBB17_44
        cmp edx, 255
        jne LBB17_51
        mov esi, 1
        jmp LBB17_52
LBB17_45:
        cmp edx, 65535
        je LBB17_48
        cmp edx, 16777215
        jne LBB17_51
        mov esi, 3
        jmp LBB17_52
LBB17_44:
        xor esi, esi
        jmp LBB17_52
LBB17_48:
        mov esi, 2
        jmp LBB17_52
LBB17_51:
        mov esi, 4
LBB17_52:
        # ---- stage 4 (last): equality test only ----
        add rsi, rcx
        lea rdx, [4*rsi + 124]          # element index of lane 0 of this group
        shl rsi, 2
        vpcmpeqq ymm0, ymm0, ymmword ptr [r14 + 8*rsi + 992]
        vpmovmskb ecx, ymm0
        cmp ecx, 65279
        jg LBB17_55
        cmp ecx, -16777216
        je LBB17_23
        cmp ecx, 255
        je LBB17_19
        jmp LBB17_57
LBB17_23:
        # Shared exit: match in lane 3.
        add rdx, 3
        pop rbx
        pop r14
        pop rbp
        vzeroupper
        ret
LBB17_55:
        cmp ecx, 65280
        je LBB17_11
        cmp ecx, 16711680
        je LBB17_18
LBB17_57:
        xor eax, eax                    # not found
        pop rbx
        pop r14
        pop rbp
        vzeroupper
        ret
IPLJS3L5XIUMWSJVWMUISC66OEHIZZMB6RTE2K4TJXR2HL55MLPAC
Without the `-C target-cpu=native` flag:
35.063ms 96.767ms
cargo asm -- simdpage::avx_search
With the `-C target-cpu=native` flag:
34.142ms 36.923ms
RUSTFLAGS='-C target-cpu=native' cargo asm -- simdpage::avx_search