gpt4 book ai didi

performance - 为什么具有不同整数参数大小的相同函数更快?

转载 作者:行者123 更新时间:2023-11-29 07:46:55 25 4
gpt4 key购买 nike

受 From Mathematics to Generic Programming 这本书的启发,我正在尝试用不同的整数大小来实现同一个函数。

我有两个不同的素数筛实现,它们分别使用 u16 和 u32。我用 cargo bench 对它们进行基准测试,u16 版本总是比 u32 版本快一点。

为什么会这样?我的假设是我的处理器 (i5-7300u) 能够同时执行两次 u16 加法,但对 u32 或 u64 做不到。然而,我不知道如何验证这一点。我附上了生成的汇编代码。

基准测试结果

test tests::bench_sift2 ... bench: 74,093 ns/iter (+/- 3,765)

test tests::bench_sift2_u16 ... bench: 61,136 ns/iter (+/- 3,389)

编辑

尝试使用不同的数组大小,以及改用 bool 数组而不是向量之后,两个函数的速度大致相同。实际上,只有当两个向量的大小都为 1<<15 时,性能差异才会显著。

编辑 2

一些有趣的观察:我在装有 Windows 10 Pro 10.0.1 的 Windows Surface 计算机上运行此代码。或多或少是偶然的,我只是用不同的节能配置运行基准测试。当我将配置设置为最高性能时,我或多或少会看到下面报告的结果。如果我将配置设置为任何其他级别,我会看到两个函数的行为似乎相同的结果,但测量误差会急剧增加。

Rust 代码

#![feature(iterator_step_by)]
#![feature(test)]

extern crate test;

/// Entry point: builds the sieve for `1 << 15` entries and prints its first flag.
fn main() {
    // To inspect the 16-bit variant instead, call `sift2_u16(1 << 15)` here.
    let sieve = sift2(1 << 15);
    println!("{}", sieve[0]);
}


/// Sieves the first `n` odd numbers starting at 3 and returns one flag per number.
///
/// Slot `k` of the result stands for the odd value `2k + 3`; a slot is left
/// `true` unless it was cleared as a multiple of an earlier surviving value.
///
/// Loop state for slot `k`:
/// * `step` is the value that slot stands for (`2k + 3`),
/// * `square_at` is the slot holding `step * step` (`2k² + 6k + 3`).
fn sift2(n: usize) -> Vec<bool> {
    let mut flags = vec![true; n];

    let mut slot = 0;
    let mut step = 3;
    let mut square_at = 3;

    // Stop as soon as the square of the current value lies outside the table.
    while square_at < n {
        if flags[slot] {
            // Clear `step * step` and every further odd multiple of `step`.
            mark_sieve(&mut flags[square_at..], step);
        }
        slot += 1;

        // Distance to the next square's slot: old step + (old step + 2).
        square_at += 2 * step + 2;
        step += 2;
    }

    flags
}

/// Same sieve as `sift2`, but with all loop counters kept in 16-bit integers.
///
/// Slot `k` of the result stands for the odd value `2k + 3`; a slot stays
/// `true` unless it was cleared as a multiple of an earlier surviving value.
///
/// Unlike a plain `+=`, the advance of `index_square` is overflow-checked:
/// for `n` close to `u16::MAX` the slot of the next square does not fit in a
/// `u16` (e.g. 65159 + 361 + 363). An unchecked add would panic in debug
/// builds and wrap around to a bogus slot in release builds. An overflowing
/// next square is necessarily `>= n`, so the loop simply ends there.
fn sift2_u16(n: u16) -> Vec<bool> {
    let mut vec = vec![true; n as usize];

    let mut i: u16 = 0;
    let mut index_square: u16 = 3;
    let mut factor: u16 = 3;

    while index_square < n {
        if vec[i as usize] {
            // Clear `factor * factor` and every further odd multiple of `factor`.
            mark_sieve(&mut vec[index_square as usize..], factor as usize);
        }
        i += 1;

        // Next square's slot = current + old factor + new factor.
        // `factor` itself cannot overflow: it is 2i + 3 with i < 180 here.
        let previous_factor = factor;
        factor += 2;
        match index_square
            .checked_add(previous_factor)
            .and_then(|partial| partial.checked_add(factor))
        {
            Some(next) => index_square = next,
            // The next square lies beyond u16::MAX, hence beyond `n`: done.
            None => break,
        }
    }

    vec
}

/// Sets every `factor`-th element of `data` to `false`, beginning with `data[0]`.
///
/// # Panics
///
/// Panics if `factor` is zero, because `step_by` rejects a zero step.
fn mark_sieve(data: &mut [bool], factor: usize) {
    for flag in data.iter_mut().step_by(factor) {
        *flag = false;
    }
}

#[cfg(test)]
mod tests {

    use super::*;
    use test::{black_box, Bencher};

    /// Benchmarks `sift2` on a table of `1 << 15` entries.
    #[bench]
    fn bench_sift2(b: &mut Bencher) {
        // `black_box` keeps the optimizer from treating the size as a known
        // constant and specializing the sieve loop for it, so the benchmark
        // measures the general code path.
        b.iter(|| sift2(black_box(1 << 15)));
    }

    /// Benchmarks `sift2_u16` on a table of `1 << 15` entries.
    #[bench]
    fn bench_sift2_u16(b: &mut Bencher) {
        // Same opaque input as above so both variants are measured alike.
        b.iter(|| sift2_u16(black_box(1 << 15)));
    }
}

为 sift2 生成的程序集

    .text
.def _ZN3std2rt10lang_start17h0092a1d276f89f87E;
.scl 2;
.type 32;
.endef
.section .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
.globl _ZN3std2rt10lang_start17h0092a1d276f89f87E
.p2align 4, 0x90
_ZN3std2rt10lang_start17h0092a1d276f89f87E:
.seh_proc _ZN3std2rt10lang_start17h0092a1d276f89f87E
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
movq %r8, %r9
movq %rdx, %rax
movq %rcx, 32(%rsp)
leaq vtable.4(%rip), %rdx
leaq 32(%rsp), %rcx
movq %rax, %r8
callq _ZN3std2rt19lang_start_internal17h273003faf754a099E
nop
addq $40, %rsp
retq
.seh_handlerdata
.section .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
.seh_endproc

.def _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.p2align 4, 0x90
_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E:
.seh_proc _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq *(%rcx)
nop
addq $40, %rsp
jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
.seh_handlerdata
.section .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.seh_endproc

.def _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.p2align 4, 0x90
_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE:
.seh_proc _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq *%rcx
nop
addq $40, %rsp
jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
.seh_handlerdata
.section .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.seh_endproc

.def _ZN4core3ptr13drop_in_place17h98ac405189abf599E;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ptr13drop_in_place17h98ac405189abf599E
.p2align 4, 0x90
_ZN4core3ptr13drop_in_place17h98ac405189abf599E:
movq 8(%rcx), %rdx
testq %rdx, %rdx
je .LBB3_1
movq (%rcx), %rcx
movl $1, %r8d
jmp __rust_dealloc
.LBB3_1:
retq

.def _ZN4core3ptr13drop_in_place17hd909dec568d984beE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ptr13drop_in_place17hd909dec568d984beE
.p2align 4, 0x90
_ZN4core3ptr13drop_in_place17hd909dec568d984beE:
retq

.def _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
.p2align 4, 0x90
_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE:
.seh_proc _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq __rust_oom
ud2
.seh_handlerdata
.section .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
.seh_endproc

.def _ZN8chapter34main17hfb06448c1bac2398E;
.scl 3;
.type 32;
.endef
.globl __xmm@00000000000080000000000000008000
.section .rdata,"dr",discard,__xmm@00000000000080000000000000008000
.p2align 4
__xmm@00000000000080000000000000008000:
.quad 32768
.quad 32768
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.p2align 4, 0x90
_ZN8chapter34main17hfb06448c1bac2398E:
.Lfunc_begin0:
.seh_proc _ZN8chapter34main17hfb06448c1bac2398E
.seh_handler __CxxFrameHandler3, @unwind, @except
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $128, %rsp
.seh_stackalloc 128
leaq 128(%rsp), %rbp
.seh_setframe 5, 128
.seh_endprologue
movq $-2, -8(%rbp)
leaq -56(%rbp), %r8
movl $32768, %ecx
movl $1, %edx
callq __rust_alloc
movq %rax, %rdi
testq %rdi, %rdi
je .LBB6_21
movl $32768, %r14d
movl $1, %edx
movl $32768, %r8d
movq %rdi, %rcx
callq memset
movq %rdi, -56(%rbp)
movaps __xmm@00000000000080000000000000008000(%rip), %xmm0
movups %xmm0, -48(%rbp)
xorl %edx, %edx
movl $3, %eax
movl $3, %ecx
cmpb $0, (%rdi,%rdx)
jne .LBB6_3
jmp .LBB6_10
.p2align 4, 0x90
.LBB6_12:
addq $2, %rax
movq -56(%rbp), %rdi
cmpb $0, (%rdi,%rdx)
je .LBB6_10
.LBB6_3:
cmpq %rcx, %r14
jb .LBB6_4
cmpq %rcx, %r14
je .LBB6_10
addq %rdi, %r14
leaq (%rdi,%rcx), %rdi
leaq -1(%rax), %rsi
addq $1, %rdi
.p2align 4, 0x90
.LBB6_9:
movb $0, -1(%rdi)
movq %r14, %rbx
subq %rdi, %rbx
addq %rax, %rdi
cmpq %rsi, %rbx
ja .LBB6_9
.LBB6_10:
addq %rax, %rcx
addq %rax, %rcx
addq $2, %rcx
cmpq $32767, %rcx
ja .LBB6_14
addq $1, %rdx
movq -40(%rbp), %r14
cmpq %rdx, %r14
ja .LBB6_12
.Ltmp6:
leaq panic_bounds_check_loc.j(%rip), %rcx
movq %r14, %r8
callq _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp7:
jmp .LBB6_6
.LBB6_14:
movq -40(%rbp), %rax
movq %rax, -64(%rbp)
movups -56(%rbp), %xmm0
movaps %xmm0, -80(%rbp)
cmpq $0, -64(%rbp)
je .LBB6_15
movq -80(%rbp), %rsi
movq %rsi, -96(%rbp)
leaq _ZN43_$LT$bool$u20$as$u20$core..fmt..Display$GT$3fmt17h27a33a0bff6802a9E(%rip), %rax
movq %rax, -88(%rbp)
leaq ref.m(%rip), %rax
movq %rax, -56(%rbp)
movq $2, -48(%rbp)
leaq ref.n(%rip), %rax
movq %rax, -40(%rbp)
movq $1, -32(%rbp)
leaq -96(%rbp), %rax
movq %rax, -24(%rbp)
movq $1, -16(%rbp)
.Ltmp2:
leaq -56(%rbp), %rcx
callq _ZN3std2io5stdio6_print17h38a18b84d105804dE
.Ltmp3:
movq -72(%rbp), %rdx
testq %rdx, %rdx
je .LBB6_19
movl $1, %r8d
movq %rsi, %rcx
callq __rust_dealloc
.LBB6_19:
nop
addq $128, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.LBB6_4:
.Ltmp0:
movq %r14, %rdx
callq _ZN4core5slice22slice_index_order_fail17hbd1edce8e1fe586aE
.Ltmp1:
.LBB6_6:
ud2
.LBB6_21:
movups -48(%rbp), %xmm0
movaps %xmm0, -80(%rbp)
movaps -80(%rbp), %xmm0
movups %xmm0, -48(%rbp)
leaq -56(%rbp), %rcx
callq _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
ud2
.LBB6_15:
.Ltmp4:
leaq panic_bounds_check_loc.j(%rip), %rcx
xorl %edx, %edx
xorl %r8d, %r8d
callq _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp5:
jmp .LBB6_6
.seh_handlerdata
.long ($cppxdata$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.def "?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
.scl 3;
.type 32;
.endef
.p2align 4, 0x90
"?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_13:
movq %rdx, 16(%rsp)
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $32, %rsp
.seh_stackalloc 32
leaq 128(%rdx), %rbp
.seh_endprologue
leaq -56(%rbp), %rcx
callq _ZN4core3ptr13drop_in_place17h98ac405189abf599E
nop
addq $32, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.seh_handlerdata
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.def "?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
.scl 3;
.type 32;
.endef
.p2align 4, 0x90
"?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_20:
movq %rdx, 16(%rsp)
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $32, %rsp
.seh_stackalloc 32
leaq 128(%rdx), %rbp
.seh_endprologue
leaq -80(%rbp), %rcx
callq _ZN4core3ptr13drop_in_place17h98ac405189abf599E
nop
addq $32, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.Lfunc_end0:
.seh_handlerdata
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.section .xdata,"dr",associative,_ZN8chapter34main17hfb06448c1bac2398E
.p2align 2
$cppxdata$_ZN8chapter34main17hfb06448c1bac2398E:
.long 429065506
.long 2
.long ($stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.long 0
.long 0
.long 6
.long ($ip2state$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.long 120
.long 0
.long 1
$stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E:
.long -1
.long "?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
.long -1
.long "?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
$ip2state$_ZN8chapter34main17hfb06448c1bac2398E:
.long .Lfunc_begin0@IMGREL
.long -1
.long .Ltmp6@IMGREL+1
.long 0
.long .Ltmp2@IMGREL+1
.long 1
.long .Ltmp0@IMGREL+1
.long 0
.long .Ltmp4@IMGREL+1
.long 1
.long .Ltmp5@IMGREL+1
.long -1
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E

.def main;
.scl 2;
.type 32;
.endef
.section .text,"xr",one_only,main
.globl main
.p2align 4, 0x90
main:
.seh_proc main
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
movq %rdx, %rax
movslq %ecx, %r8
leaq _ZN8chapter34main17hfb06448c1bac2398E(%rip), %rcx
movq %rcx, 32(%rsp)
leaq vtable.4(%rip), %rdx
leaq 32(%rsp), %rcx
movq %rax, %r9
callq _ZN3std2rt19lang_start_internal17h273003faf754a099E
nop
addq $40, %rsp
retq
.seh_handlerdata
.section .text,"xr",one_only,main
.seh_endproc

.section .rdata,"dr",one_only,vtable.4
.p2align 3
vtable.4:
.quad _ZN4core3ptr13drop_in_place17hd909dec568d984beE
.quad 8
.quad 8
.quad _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.quad _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.quad _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE

.section .rdata,"dr",one_only,str.i
.p2align 4
str.i:
.ascii "C:\\projects\\rust\\src\\liballoc\\vec.rs"

.section .rdata,"dr",one_only,panic_bounds_check_loc.j
.p2align 3
panic_bounds_check_loc.j:
.quad str.i
.quad 36
.long 1551
.long 10

.section .rdata,"dr",one_only,str.k
str.k:

.section .rdata,"dr",one_only,str.l
str.l:
.byte 10

.section .rdata,"dr",one_only,ref.m
.p2align 3
ref.m:
.quad str.k
.quad 0
.quad str.l
.quad 1

.section .rdata,"dr",one_only,ref.n
.p2align 3
ref.n:
.quad 1
.quad 0
.quad 3
.zero 8
.quad 3
.zero 8
.long 32
.long 0
.byte 3
.zero 7

为 sift2_u16 生成的程序集

U16
.text
.def _ZN3std2rt10lang_start17h0092a1d276f89f87E;
.scl 2;
.type 32;
.endef
.section .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
.globl _ZN3std2rt10lang_start17h0092a1d276f89f87E
.p2align 4, 0x90
_ZN3std2rt10lang_start17h0092a1d276f89f87E:
.seh_proc _ZN3std2rt10lang_start17h0092a1d276f89f87E
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
movq %r8, %r9
movq %rdx, %rax
movq %rcx, 32(%rsp)
leaq vtable.4(%rip), %rdx
leaq 32(%rsp), %rcx
movq %rax, %r8
callq _ZN3std2rt19lang_start_internal17h273003faf754a099E
nop
addq $40, %rsp
retq
.seh_handlerdata
.section .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
.seh_endproc

.def _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.p2align 4, 0x90
_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E:
.seh_proc _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq *(%rcx)
nop
addq $40, %rsp
jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
.seh_handlerdata
.section .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.seh_endproc

.def _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.p2align 4, 0x90
_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE:
.seh_proc _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq *%rcx
nop
addq $40, %rsp
jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
.seh_handlerdata
.section .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.seh_endproc

.def _ZN4core3ptr13drop_in_place17h98ac405189abf599E;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ptr13drop_in_place17h98ac405189abf599E
.p2align 4, 0x90
_ZN4core3ptr13drop_in_place17h98ac405189abf599E:
movq 8(%rcx), %rdx
testq %rdx, %rdx
je .LBB3_1
movq (%rcx), %rcx
movl $1, %r8d
jmp __rust_dealloc
.LBB3_1:
retq

.def _ZN4core3ptr13drop_in_place17hd909dec568d984beE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ptr13drop_in_place17hd909dec568d984beE
.p2align 4, 0x90
_ZN4core3ptr13drop_in_place17hd909dec568d984beE:
retq

.def _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
.p2align 4, 0x90
_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE:
.seh_proc _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq __rust_oom
ud2
.seh_handlerdata
.section .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
.seh_endproc

.def _ZN8chapter34main17hfb06448c1bac2398E;
.scl 3;
.type 32;
.endef
.globl __xmm@00000000000080000000000000008000
.section .rdata,"dr",discard,__xmm@00000000000080000000000000008000
.p2align 4
__xmm@00000000000080000000000000008000:
.quad 32768
.quad 32768
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.p2align 4, 0x90
_ZN8chapter34main17hfb06448c1bac2398E:
.Lfunc_begin0:
.seh_proc _ZN8chapter34main17hfb06448c1bac2398E
.seh_handler __CxxFrameHandler3, @unwind, @except
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $128, %rsp
.seh_stackalloc 128
leaq 128(%rsp), %rbp
.seh_setframe 5, 128
.seh_endprologue
movq $-2, -8(%rbp)
leaq -56(%rbp), %r8
movl $32768, %ecx
movl $1, %edx
callq __rust_alloc
movq %rax, %r14
testq %r14, %r14
je .LBB6_23
movl $32768, %edi
movl $1, %edx
movl $32768, %r8d
movq %r14, %rcx
callq memset
movq %r14, -56(%rbp)
movaps __xmm@00000000000080000000000000008000(%rip), %xmm0
movups %xmm0, -48(%rbp)
movw $3, %r8w
xorl %edx, %edx
movw $3, %r9w
cmpb $0, (%r14,%rdx)
jne .LBB6_3
jmp .LBB6_12
.p2align 4, 0x90
.LBB6_14:
movq -56(%rbp), %r14
cmpb $0, (%r14,%rdx)
je .LBB6_12
.LBB6_3:
movzwl %r9w, %ecx
cmpq %rcx, %rdi
jb .LBB6_4
testw %r8w, %r8w
je .LBB6_8
cmpq %rcx, %rdi
je .LBB6_12
addq %r14, %rcx
movzwl %r8w, %ebx
addq %r14, %rdi
leaq -1(%rbx), %rax
addq $1, %rcx
.p2align 4, 0x90
.LBB6_11:
movb $0, -1(%rcx)
movq %rdi, %rsi
subq %rcx, %rsi
addq %rbx, %rcx
cmpq %rax, %rsi
ja .LBB6_11
.LBB6_12:
addl %r8d, %r9d
addl $2, %r8d
addw %r8w, %r9w
js .LBB6_16
addq $1, %rdx
movq -40(%rbp), %rdi
cmpq %rdx, %rdi
ja .LBB6_14
.Ltmp8:
leaq panic_bounds_check_loc.j(%rip), %rcx
movq %rdi, %r8
callq _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp9:
jmp .LBB6_6
.LBB6_16:
movq -40(%rbp), %rax
movq %rax, -64(%rbp)
movups -56(%rbp), %xmm0
movaps %xmm0, -80(%rbp)
cmpq $0, -64(%rbp)
je .LBB6_17
movq -80(%rbp), %rsi
movq %rsi, -96(%rbp)
leaq _ZN43_$LT$bool$u20$as$u20$core..fmt..Display$GT$3fmt17h27a33a0bff6802a9E(%rip), %rax
movq %rax, -88(%rbp)
leaq ref.m(%rip), %rax
movq %rax, -56(%rbp)
movq $2, -48(%rbp)
leaq ref.n(%rip), %rax
movq %rax, -40(%rbp)
movq $1, -32(%rbp)
leaq -96(%rbp), %rax
movq %rax, -24(%rbp)
movq $1, -16(%rbp)
.Ltmp4:
leaq -56(%rbp), %rcx
callq _ZN3std2io5stdio6_print17h38a18b84d105804dE
.Ltmp5:
movq -72(%rbp), %rdx
testq %rdx, %rdx
je .LBB6_21
movl $1, %r8d
movq %rsi, %rcx
callq __rust_dealloc
.LBB6_21:
nop
addq $128, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.LBB6_4:
.Ltmp2:
movq %rdi, %rdx
callq _ZN4core5slice22slice_index_order_fail17hbd1edce8e1fe586aE
.Ltmp3:
jmp .LBB6_6
.LBB6_8:
.Ltmp0:
leaq ref.b(%rip), %rcx
callq _ZN4core9panicking5panic17h42feaa2e0dc2c607E
.Ltmp1:
.LBB6_6:
ud2
.LBB6_23:
movups -48(%rbp), %xmm0
movaps %xmm0, -80(%rbp)
movaps -80(%rbp), %xmm0
movups %xmm0, -48(%rbp)
leaq -56(%rbp), %rcx
callq _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
ud2
.LBB6_17:
.Ltmp6:
leaq panic_bounds_check_loc.j(%rip), %rcx
xorl %edx, %edx
xorl %r8d, %r8d
callq _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp7:
jmp .LBB6_6
.seh_handlerdata
.long ($cppxdata$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.def "?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
.scl 3;
.type 32;
.endef
.p2align 4, 0x90
"?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_15:
movq %rdx, 16(%rsp)
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $32, %rsp
.seh_stackalloc 32
leaq 128(%rdx), %rbp
.seh_endprologue
leaq -56(%rbp), %rcx
callq _ZN4core3ptr13drop_in_place17h98ac405189abf599E
nop
addq $32, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.seh_handlerdata
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.def "?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
.scl 3;
.type 32;
.endef
.p2align 4, 0x90
"?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_22:
movq %rdx, 16(%rsp)
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $32, %rsp
.seh_stackalloc 32
leaq 128(%rdx), %rbp
.seh_endprologue
leaq -80(%rbp), %rcx
callq _ZN4core3ptr13drop_in_place17h98ac405189abf599E
nop
addq $32, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.Lfunc_end0:
.seh_handlerdata
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.section .xdata,"dr",associative,_ZN8chapter34main17hfb06448c1bac2398E
.p2align 2
$cppxdata$_ZN8chapter34main17hfb06448c1bac2398E:
.long 429065506
.long 2
.long ($stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.long 0
.long 0
.long 6
.long ($ip2state$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.long 120
.long 0
.long 1
$stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E:
.long -1
.long "?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
.long -1
.long "?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
$ip2state$_ZN8chapter34main17hfb06448c1bac2398E:
.long .Lfunc_begin0@IMGREL
.long -1
.long .Ltmp8@IMGREL+1
.long 0
.long .Ltmp4@IMGREL+1
.long 1
.long .Ltmp2@IMGREL+1
.long 0
.long .Ltmp6@IMGREL+1
.long 1
.long .Ltmp7@IMGREL+1
.long -1
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E

.def main;
.scl 2;
.type 32;
.endef
.section .text,"xr",one_only,main
.globl main
.p2align 4, 0x90
main:
.seh_proc main
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
movq %rdx, %rax
movslq %ecx, %r8
leaq _ZN8chapter34main17hfb06448c1bac2398E(%rip), %rcx
movq %rcx, 32(%rsp)
leaq vtable.4(%rip), %rdx
leaq 32(%rsp), %rcx
movq %rax, %r9
callq _ZN3std2rt19lang_start_internal17h273003faf754a099E
nop
addq $40, %rsp
retq
.seh_handlerdata
.section .text,"xr",one_only,main
.seh_endproc

.section .rdata,"dr",one_only,vtable.4
.p2align 3
vtable.4:
.quad _ZN4core3ptr13drop_in_place17hd909dec568d984beE
.quad 8
.quad 8
.quad _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.quad _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.quad _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE

.section .rdata,"dr",one_only,str.9
.p2align 4
str.9:
.ascii "assertion failed: step != 0"

.section .rdata,"dr",one_only,str.a
.p2align 4
str.a:
.ascii "libcore\\iter\\iterator.rs"

.section .rdata,"dr",one_only,ref.b
.p2align 3
ref.b:
.quad str.9
.quad 27
.quad str.a
.quad 24
.long 299
.long 9

.section .rdata,"dr",one_only,str.i
.p2align 4
str.i:
.ascii "C:\\projects\\rust\\src\\liballoc\\vec.rs"

.section .rdata,"dr",one_only,panic_bounds_check_loc.j
.p2align 3
panic_bounds_check_loc.j:
.quad str.i
.quad 36
.long 1551
.long 10

.section .rdata,"dr",one_only,str.k
str.k:

.section .rdata,"dr",one_only,str.l
str.l:
.byte 10

.section .rdata,"dr",one_only,ref.m
.p2align 3
ref.m:
.quad str.k
.quad 0
.quad str.l
.quad 1

.section .rdata,"dr",one_only,ref.n
.p2align 3
ref.n:
.quad 1
.quad 0
.quad 3
.zero 8
.quad 3
.zero 8
.long 32
.long 0
.byte 3
.zero 7

最佳答案

我从未尝试过 Rust,但我知道一些进行此类性能分析的好工具。因此,虽然它可能无法完全回答您的问题,但您将获得工具来深入研究这个问题。

当试图理解底层性能时,您必须查看生成的汇编代码,您似乎已经通过提供的输出做到了这一点。然而,它的可读性很差。这时就轮到我偏爱的工具登场了:Compiler Explorer。您可以在那里查看您的代码(your code here)。

从生成的程序集中,我们看到了一些差异。让我们只关注循环的 2 个元素(您可以检查其他元素,但想法和结果是相同的)

32 位版本中 while 循环的条件判断:

mov     rax, qword ptr [rbp - 112]
cmp qword ptr [rbp - 64], rax
jb .LBB124_5

在 16 位中也是如此:

mov     ax, word ptr [rbp - 98]
cmp word ptr [rbp - 52], ax
jb .LBB125_5

32 位版本中对 mark_sieve 的调用:

.LBB124_8:
mov rax, qword ptr [rbp - 64]
mov qword ptr [rbp - 48], rax
mov rsi, qword ptr [rbp - 48]
lea rdi, [rbp - 96]
call <alloc::vec::Vec<T> as core::ops::index::IndexMut<core::ops::range::RangeFrom<usize>>>::index_mut
mov qword ptr [rbp - 136], rdx
mov qword ptr [rbp - 144], rax
jmp .LBB124_9
.LBB124_9:
mov rdx, qword ptr [rbp - 56]
mov rdi, qword ptr [rbp - 144]
mov rsi, qword ptr [rbp - 136]
call example::mark_sieve
jmp .LBB124_10

16 位版本中的同一调用:

.LBB125_8:
movzx eax, word ptr [rbp - 52]
mov ecx, eax
mov qword ptr [rbp - 48], rcx
mov rsi, qword ptr [rbp - 48]
lea rdi, [rbp - 80]
call <alloc::vec::Vec<T> as core::ops::index::IndexMut<core::ops::range::RangeFrom<usize>>>::index_mut
mov qword ptr [rbp - 120], rdx
mov qword ptr [rbp - 128], rax
jmp .LBB125_9
.LBB125_9:
movzx eax, word ptr [rbp - 50]
mov edx, eax
mov rdi, qword ptr [rbp - 128]
mov rsi, qword ptr [rbp - 120]
call example::mark_sieve
jmp .LBB125_10

我们可以在这段代码中看到一些不同之处:

  • u16 代码可以使用 16 位(ax、cx、dx)或 32 位(eax、ecx、edx)寄存器,而 u32 代码只使用 64 位(rax、rcx、rdx)寄存器。
  • u16 代码使用 movzx 而不是 mov 来取消引用指针。
  • u16 代码读取 16 位的内存(word),而 u32 代码读取 64 位(qword)。

对于这些不同的指令,您可以查看优秀的 Agner Instruction Tables 查看它们的相对执行时间差异. (顺便说一句,我很乐意看到它们集成到编译器资源管理器中)。您的 CPU 似乎是 KabyLake(因此我们将使用 Skylake 架构),因此我们将使用第 231 页开始的表格(阅读此页面以获取表格中使用的缩写)。

根据 Agner 表,mov r64,m 和 movzx r,m 的微指令数相同(都是 1 条微指令),但 mov 的延迟要多 2 个周期。

不同的累加器也会改变处理器所做的一些优化。

编译器还会执行其他依赖于架构的优化,例如根据 CPU 上可用的 ALU 数量展开循环。因此,根据编译器的决定,您的代码在不同 CPU 之间的行为可能会有所不同。

差异也可能是由于代码对齐或缓存优化所致。

关于电源管理差异,这可能受到 2 个因素的影响:频率上限和 C 状态(C-state)管理。C 状态是 CPU 在短暂空闲时进入的不同休眠状态。休眠/唤醒对 CPU 内部结构的影响取决于固件。所以这不是我们可以真正详细检查的东西(性能分析本身也会改变结果)。

我写过一篇关于理解 Meltdown/Spectre 攻击的文章(a post about understanding Meltdown/Spectre attack),其中解释了 CPU 可以在底层进行的各种优化(这些优化连汇编代码也无法反映出来)。您可能还想看一下它,以更好地理解为什么 CPU 层面的优化很困难,因为有很多参数是我们无法控制的。

祝你黑客愉快!

关于performance - 为什么具有不同整数参数大小的相同函数更快?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/49084635/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com