#1 Profiling generated asm

Opened by Kerollmops on January 28, 2021
Kerollmops on January 28, 2021
Without the `-C target-cpu=native` flag

35.063ms 96.767ms

cargo asm -- simdpage::avx_search

; simdpage::avx_search -- cargo-asm listing (Intel operand order, cargo-asm's
; comma-separated operand tokens).
; In this build the AVX/AVX2 intrinsics are NOT inlined: every load, compare and
; movemask is an out-of-line call whose 256-bit operands and result travel
; through 32-byte stack buffers, moved around 16 bytes at a time with movaps.
; In:   rdi = base address the 32-byte loads are taken from (saved at [rsp+24])
;       rsi = 64-bit value replicated into four lanes and compared ("key" below)
; Out:  rax = 1 when a compare-equal mask had exactly one lane set, else 0
;       rdx = r12, the element index (8-byte elements); on a hit the lane
;             number 0..3 has been added to it
; NOTE(review): the Rust-level signature/return type is not visible here.
simdpage::avx_search:
 ; Prologue: frame pointer, save callee-saved registers, align the stack to
 ; 32 bytes and reserve 416 bytes of locals.
 push    rbp
 mov     rbp, rsp
 push    r15
 push    r14
 push    r13
 push    r12
 push    rbx
 and     rsp, -32
 sub     rsp, 416
 ; Keep the base pointer for the loop.
 mov     qword, ptr, [rsp, +, 24], rdi
 ; Write rsi into four consecutive qwords: a 32-byte buffer at [rsp+96].
 mov     qword, ptr, [rsp, +, 96], rsi
 mov     qword, ptr, [rsp, +, 104], rsi
 mov     qword, ptr, [rsp, +, 112], rsi
 mov     qword, ptr, [rsp, +, 120], rsi
 ; Copy two 16-byte constants into a 32-byte stack table at [rsp+288]; the
 ; loop reads it as qwords at [rsp + r13 + 288].
 ; NOTE(review): the constant values are not shown in this listing.
 movaps  xmm0, xmmword, ptr, [rip, +, LCPI21_0]
 movaps  xmmword, ptr, [rsp, +, 288], xmm0
 movaps  xmm0, xmmword, ptr, [rip, +, LCPI21_1]
 movaps  xmmword, ptr, [rsp, +, 304], xmm0
 ; Runtime CPU-feature check against the std_detect cache word: zero means
 ; "not initialised yet" (LBB21_1); otherwise bit 0x4000 must be set or the
 ; function panics (LBB21_10).
 ; NOTE(review): presumably the AVX2 feature bit -- confirm against std_detect.
 mov     rax, qword, ptr, [rip, +, __ZN3std10std_detect6detect5cache5CACHE17hbe0e9d1b17ddb275E@GOTPCREL]
 mov     rax, qword, ptr, [rax]
 test    rax, rax
 je      LBB21_1
 test    eax, 16384
 je      LBB21_10
LBB21_3:
 ; Load the 4 x rsi buffer through _mm256_loadu_si256 (rdi = result slot
 ; [rsp+32], rsi = source [rsp+96]) and keep the result in two halves:
 ; bytes 0..15 at [rsp+80], bytes 16..31 at [rsp+64].
 lea     rdi, [rsp, +, 32]
 lea     rsi, [rsp, +, 96]
 call    core::core_arch::x86::avx::_mm256_loadu_si256
 movaps  xmm0, xmmword, ptr, [rsp, +, 32]
 movaps  xmmword, ptr, [rsp, +, 80], xmm0
 movaps  xmm0, xmmword, ptr, [rsp, +, 48]
 movaps  xmmword, ptr, [rsp, +, 64], xmm0
 ; Loop state:
 ;   r13 = byte offset into the [rsp+288] table, starts at 8, +8 per iteration
 ;   r12 = current element index, starts at 0
 ;   r15 = accumulated branch selector, starts at 0
 ;   rbx = result slot for the equality compare, r14 = for the greater-than
 mov     r13d, 8
 xor     r12d, r12d
 lea     rbx, [rsp, +, 320]
 lea     r14, [rsp, +, 352]
 xor     r15d, r15d
LBB21_4:
 ; Loop head: load 32 bytes from base + 8*r12 into [rsp+32].
 mov     rax, qword, ptr, [rsp, +, 24]
 lea     rsi, [rax, +, 8*r12]
 lea     rdi, [rsp, +, 32]
 call    core::core_arch::x86::avx::_mm256_loadu_si256
 ; Stage the operands: key copy -> [rsp+160..191], loaded data -> [rsp+192..223];
 ; a second copy of the loaded data is kept at [rsp+144] (low half) and
 ; [rsp+128] (high half) for the greater-than compare below.
 movaps  xmm1, xmmword, ptr, [rsp, +, 32]
 movaps  xmm2, xmmword, ptr, [rsp, +, 48]
 movaps  xmm0, xmmword, ptr, [rsp, +, 64]
 movaps  xmmword, ptr, [rsp, +, 176], xmm0
 movaps  xmm0, xmmword, ptr, [rsp, +, 80]
 movaps  xmmword, ptr, [rsp, +, 160], xmm0
 movaps  xmmword, ptr, [rsp, +, 128], xmm2
 movaps  xmmword, ptr, [rsp, +, 208], xmm2
 movaps  xmmword, ptr, [rsp, +, 144], xmm1
 movaps  xmmword, ptr, [rsp, +, 192], xmm1
 ; Per-lane 64-bit equality of key vs. loaded data, result written to [rbx],
 ; then reduced to a 32-bit byte mask in eax (8 mask bits per 64-bit lane).
 mov     rdi, rbx
 lea     rsi, [rsp, +, 160]
 lea     rdx, [rsp, +, 192]
 call    core::core_arch::x86::avx2::_mm256_cmpeq_epi64
 mov     rdi, rbx
 call    core::core_arch::x86::avx2::_mm256_movemask_epi8
 ; Dispatch on the mask. Only the four single-lane patterns count as a hit:
 ;   0x000000FF (255)       lane 0 -> LBB21_20
 ;   0x0000FF00 (65280)     lane 1 -> LBB21_14
 ;   0x00FF0000 (16711680)  lane 2 -> LBB21_15
 ;   0xFF000000 (-16777216) lane 3 -> LBB21_19
 ; Any other mask (none or several lanes equal) continues at LBB21_7.
 cmp     eax, 65279
 jg      LBB21_12
 cmp     eax, -16777216
 je      LBB21_19
 cmp     eax, 255
 jne     LBB21_7
 jmp     LBB21_20
LBB21_12:
 cmp     eax, 16711680
 je      LBB21_15
 cmp     eax, 65280
 je      LBB21_14
LBB21_7:
 ; No single-lane equality: compare greater-than instead. First operand is
 ; the key copy ([rsp+224..255]), second the loaded data ([rsp+256..287]).
 movaps  xmm0, xmmword, ptr, [rsp, +, 80]
 movaps  xmmword, ptr, [rsp, +, 224], xmm0
 movaps  xmm0, xmmword, ptr, [rsp, +, 64]
 movaps  xmmword, ptr, [rsp, +, 240], xmm0
 movaps  xmm0, xmmword, ptr, [rsp, +, 144]
 movaps  xmmword, ptr, [rsp, +, 256], xmm0
 movaps  xmm0, xmmword, ptr, [rsp, +, 128]
 movaps  xmmword, ptr, [rsp, +, 272], xmm0
 mov     rdi, r14
 lea     rsi, [rsp, +, 224]
 lea     rdx, [rsp, +, 256]
 call    core::core_arch::x86::avx2::_mm256_cmpgt_epi64
 mov     rdi, r14
 call    core::core_arch::x86::avx2::_mm256_movemask_epi8
 ; Turn the greater-than mask into a selector in eax:
 ;   0 -> 0, 0xFF -> 1, 0xFFFF -> 2, 0xFFFFFF -> 3, anything else -> 4.
 ; Every arm then tests r13 == 32: if so this was the last iteration and the
 ; function leaves through LBB21_11 (not found), otherwise it continues at
 ; LBB21_27.
 cmp     eax, 65534
 jg      LBB21_16
 test    eax, eax
 je      LBB21_9
 cmp     eax, 255
 jne     LBB21_25
 mov     eax, 1
 cmp     r13, 32
 jne     LBB21_27
 jmp     LBB21_11
LBB21_16:
 cmp     eax, 65535
 je      LBB21_23
 cmp     eax, 16777215
 jne     LBB21_25
 mov     eax, 3
 cmp     r13, 32
 jne     LBB21_27
 jmp     LBB21_11
LBB21_25:
 mov     eax, 4
 cmp     r13, 32
 jne     LBB21_27
 jmp     LBB21_11
LBB21_9:
 xor     eax, eax
 cmp     r13, 32
 jne     LBB21_27
 jmp     LBB21_11
LBB21_23:
 mov     eax, 2
 cmp     r13, 32
 je      LBB21_11
LBB21_27:
 ; Advance to the next position:
 ;   rax = selector + 5*r15          (becomes the new r15)
 ;   r12 = 4*rax + qword [rsp + r13 + 288]
 ;   r13 += 8
 ; Loop while r12 < 625 (unsigned); otherwise call panic_bounds_check with
 ; rdi = r12, esi = 625, rdx = l___unnamed_26.
 ; NOTE(review): 625 is presumably the length of the indexed array -- confirm.
 lea     rcx, [r15, +, 4*r15]
 add     rax, rcx
 lea     r12, [4*rax]
 add     r12, qword, ptr, [rsp, +, r13, +, 288]
 add     r13, 8
 mov     r15, rax
 cmp     r12, 625
 jb      LBB21_4
 lea     rdx, [rip, +, l___unnamed_26]
 mov     esi, 625
 mov     rdi, r12
 call    core::panicking::panic_bounds_check
LBB21_11:
 ; Not found: rax = 0, then the shared epilogue.
 xor     eax, eax
 jmp     LBB21_21
 ; Hit in lane 3 / 2 / 1: add the lane number to the element index.
LBB21_19:
 add     r12, 3
 jmp     LBB21_20
LBB21_15:
 add     r12, 2
 jmp     LBB21_20
LBB21_14:
 inc     r12
LBB21_20:
 ; Found: rax = 1.
 mov     eax, 1
LBB21_21:
 ; Shared epilogue: rdx = element index, undo the aligned frame via rbp,
 ; restore callee-saved registers and return the rax/rdx pair.
 mov     rdx, r12
 lea     rsp, [rbp, -, 40]
 pop     rbx
 pop     r12
 pop     r13
 pop     r14
 pop     r15
 pop     rbp
 ret
LBB21_1:
 ; Feature cache was zero: run detection, then re-test bit 0x4000.
 call    std::std_detect::detect::cache::detect_and_initialize
 test    eax, 16384
 jne     LBB21_3
LBB21_10:
 ; Required feature bit missing: begin_panic with rdi = l___unnamed_27,
 ; esi = 49, rdx = l___unnamed_28.
 ; NOTE(review): 49 is presumably the panic message length -- confirm.
 lea     rdi, [rip, +, l___unnamed_27]
 lea     rdx, [rip, +, l___unnamed_28]
 mov     esi, 49
 call    std::panicking::begin_panic
With the `-C target-cpu=native` flag

34.142ms 36.923ms

RUSTFLAGS='-C target-cpu=native' cargo asm -- simdpage::avx_search

; simdpage::avx_search -- cargo-asm listing (Intel operand order, cargo-asm's
; comma-separated operand tokens).
; In this build the vector operations appear inline as AVX2 instructions
; (vpbroadcastq / vpcmpeqq / vpcmpgtq / vpmovmskb) on ymm registers, and the
; search is laid out as four straight-line levels instead of a loop.
; In:   rdi = base address the 32-byte loads are taken from (kept in r14)
;       rsi = 64-bit value broadcast to four lanes (kept in rbx, then ymm0)
; Out:  rax = 1 when a compare-equal mask had exactly one lane set, else 0
;       rdx = element index (8-byte elements) of the compared group, plus the
;             lane number 0..3 on a hit
; NOTE(review): the Rust-level signature/return type is not visible here.
simdpage::avx_search:
 ; Prologue: frame pointer plus the two callee-saved registers used below.
 push    rbp
 mov     rbp, rsp
 push    r14
 push    rbx
 mov     rbx, rsi
 mov     r14, rdi
 ; Runtime CPU-feature check against the std_detect cache word: zero means
 ; "not initialised yet" (LBB17_1); otherwise bit 0x4000 must be set or the
 ; function panics (LBB17_58).
 ; NOTE(review): presumably the AVX2 feature bit -- confirm against std_detect.
 mov     rax, qword, ptr, [rip, +, __ZN3std10std_detect6detect5cache5CACHE17hbe0e9d1b17ddb275E@GOTPCREL]
 mov     rax, qword, ptr, [rax]
 test    rax, rax
 je      LBB17_1
 test    eax, 16384
 je      LBB17_58
LBB17_3:
 ; Level 1: ymm0 = key in all four 64-bit lanes, ymm1 = 32 bytes at [r14].
 ; ecx = byte mask of the per-lane equality (8 mask bits per lane).
 ; eax = 1 and edx = 0 are preset as the "found at index 0" result; eax is
 ; only cleared on the not-found exit (LBB17_57).
 vmovq   xmm0, rbx
 vpbroadcastq ymm0, xmm0
 vmovdqu ymm1, ymmword, ptr, [r14]
 vpcmpeqq ymm2, ymm0, ymm1
 vpmovmskb ecx, ymm2
 mov     eax, 1
 xor     edx, edx
 ; Dispatch on the mask. Only the four single-lane patterns count as a hit:
 ;   0x000000FF (255)       lane 0 -> LBB17_19 (rdx unchanged)
 ;   0x0000FF00 (65280)     lane 1 -> LBB17_11 (rdx += 1)
 ;   0x00FF0000 (16711680)  lane 2 -> LBB17_18 (rdx += 2)
 ;   0xFF000000 (-16777216) lane 3 -> LBB17_23 (rdx += 3)
 ; Any other mask continues with the greater-than compare (LBB17_6).
 cmp     ecx, 65279
 jg      LBB17_9
 cmp     ecx, -16777216
 je      LBB17_23
 cmp     ecx, 255
 jne     LBB17_6
LBB17_19:
 ; Return: hit in lane 0, rdx already holds the index.
 pop     rbx
 pop     r14
 pop     rbp
 vzeroupper
 ret
LBB17_9:
 cmp     ecx, 16711680
 je      LBB17_18
 cmp     ecx, 65280
 jne     LBB17_6
LBB17_11:
 ; Return: hit in lane 1.
 inc     rdx
 pop     rbx
 pop     r14
 pop     rbp
 vzeroupper
 ret
LBB17_6:
 ; Level 1 miss: ymm1 = (key > loaded) per lane; map the byte mask to a
 ; selector in ecx:
 ;   0 -> 0, 0xFF -> 1, 0xFFFF -> 2, 0xFFFFFF -> 3, anything else -> 4.
 vpcmpgtq ymm1, ymm0, ymm1
 vpmovmskb ecx, ymm1
 cmp     ecx, 65534
 jg      LBB17_12
 test    ecx, ecx
 je      LBB17_8
 cmp     ecx, 255
 jne     LBB17_17
 mov     ecx, 1
 jmp     LBB17_21
LBB17_12:
 cmp     ecx, 65535
 je      LBB17_15
 cmp     ecx, 16777215
 jne     LBB17_17
 mov     ecx, 3
 jmp     LBB17_21
LBB17_1:
 ; Feature cache was zero: run detection, then re-test bit 0x4000.
 call    std::std_detect::detect::cache::detect_and_initialize
 test    eax, 16384
 jne     LBB17_3
LBB17_58:
 ; Required feature bit missing: begin_panic with rdi = l___unnamed_26,
 ; esi = 49, rdx = l___unnamed_27.
 ; NOTE(review): 49 is presumably the panic message length -- confirm.
 lea     rdi, [rip, +, l___unnamed_26]
 lea     rdx, [rip, +, l___unnamed_27]
 mov     esi, 49
 call    std::panicking::begin_panic
LBB17_8:
 xor     ecx, ecx
 jmp     LBB17_21
LBB17_15:
 mov     ecx, 2
 jmp     LBB17_21
LBB17_17:
 mov     ecx, 4
LBB17_21:
 ; Level 2: rcx = level-1 selector. Element index rdx = 4*rcx + 4; the load
 ; address r14 + 8*(4*rcx) + 32 is that same element. Same equality test and
 ; single-lane dispatch as level 1.
 lea     rsi, [4*rcx]
 lea     rdx, [4*rcx, +, 4]
 vmovdqu ymm1, ymmword, ptr, [r14, +, 8*rsi, +, 32]
 vpcmpeqq ymm2, ymm0, ymm1
 vpmovmskb esi, ymm2
 cmp     esi, 65279
 jg      LBB17_25
 cmp     esi, -16777216
 je      LBB17_23
 cmp     esi, 255
 je      LBB17_19
 jmp     LBB17_27
LBB17_25:
 cmp     esi, 65280
 je      LBB17_11
 cmp     esi, 16711680
 jne     LBB17_27
LBB17_18:
 ; Return: hit in lane 2.
 add     rdx, 2
 pop     rbx
 pop     r14
 pop     rbp
 vzeroupper
 ret
LBB17_27:
 ; Level 2 miss: greater-than mask in esi; rdx = 5 * (previous selector),
 ; then the mask is mapped to a new 0..4 selector in ecx as before.
 vpcmpgtq ymm1, ymm0, ymm1
 vpmovmskb esi, ymm1
 lea     rdx, [rcx, +, 4*rcx]
 cmp     esi, 65534
 jg      LBB17_30
 test    esi, esi
 je      LBB17_29
 cmp     esi, 255
 jne     LBB17_36
 mov     ecx, 1
 jmp     LBB17_37
LBB17_30:
 cmp     esi, 65535
 je      LBB17_33
 cmp     esi, 16777215
 jne     LBB17_36
 mov     ecx, 3
 jmp     LBB17_37
LBB17_29:
 xor     ecx, ecx
 jmp     LBB17_37
LBB17_33:
 mov     ecx, 2
 jmp     LBB17_37
LBB17_36:
 mov     ecx, 4
LBB17_37:
 ; Level 3: rcx = new selector + 5 * previous selector. Element index
 ; rdx = 4*rcx + 24; the load address r14 + 8*(4*rcx) + 192 is that element.
 add     rcx, rdx
 lea     rsi, [4*rcx]
 lea     rdx, [4*rcx, +, 24]
 vmovdqu ymm1, ymmword, ptr, [r14, +, 8*rsi, +, 192]
 vpcmpeqq ymm2, ymm0, ymm1
 vpmovmskb esi, ymm2
 cmp     esi, 65279
 jg      LBB17_40
 cmp     esi, -16777216
 je      LBB17_23
 cmp     esi, 255
 je      LBB17_19
 jmp     LBB17_42
LBB17_40:
 cmp     esi, 65280
 je      LBB17_11
 cmp     esi, 16711680
 je      LBB17_18
LBB17_42:
 ; Level 3 miss: greater-than mask in edx; rcx = 5 * (accumulated selector),
 ; then the mask is mapped to a new 0..4 selector in esi.
 vpcmpgtq ymm1, ymm0, ymm1
 vpmovmskb edx, ymm1
 lea     rcx, [rcx, +, 4*rcx]
 cmp     edx, 65534
 jg      LBB17_45
 test    edx, edx
 je      LBB17_44
 cmp     edx, 255
 jne     LBB17_51
 mov     esi, 1
 jmp     LBB17_52
LBB17_45:
 cmp     edx, 65535
 je      LBB17_48
 cmp     edx, 16777215
 jne     LBB17_51
 mov     esi, 3
 jmp     LBB17_52
LBB17_44:
 xor     esi, esi
 jmp     LBB17_52
LBB17_48:
 mov     esi, 2
 jmp     LBB17_52
LBB17_51:
 mov     esi, 4
LBB17_52:
 ; Level 4 (last): rsi = new selector + 5 * accumulated selector. Element
 ; index rdx = 4*rsi + 124; after rsi is scaled by 4, the compared memory
 ; operand r14 + 8*rsi + 992 is that element. Only the equality test is
 ; done here; a miss ends the search at LBB17_57.
 add     rsi, rcx
 lea     rdx, [4*rsi, +, 124]
 shl     rsi, 2
 vpcmpeqq ymm0, ymm0, ymmword, ptr, [r14, +, 8*rsi, +, 992]
 vpmovmskb ecx, ymm0
 cmp     ecx, 65279
 jg      LBB17_55
 cmp     ecx, -16777216
 je      LBB17_23
 cmp     ecx, 255
 je      LBB17_19
 jmp     LBB17_57
LBB17_23:
 ; Return: hit in lane 3.
 add     rdx, 3
 pop     rbx
 pop     r14
 pop     rbp
 vzeroupper
 ret
LBB17_55:
 cmp     ecx, 65280
 je      LBB17_11
 cmp     ecx, 16711680
 je      LBB17_18
LBB17_57:
 ; Not found after the fourth level: rax = 0 (rdx is left as computed).
 xor     eax, eax
 pop     rbx
 pop     r14
 pop     rbp
 vzeroupper
 ret
Kerollmops added a change on January 28, 2021
IPLJS3L5XIUMWSJVWMUISC66OEHIZZMB6RTE2K4TJXR2HL55MLPAC
main