gpt4 book ai didi

c - 为什么 -O1 比 -O2 快

转载 作者:行者123 更新时间:2023-12-03 19:00:49 26 4
gpt4 key购买 nike

我写了一个这样的C代码:

#include <stdio.h>
#define N 19

int main(void){
int a[N];
int ans = 0;

for(int i = 0; i < N; ++i){
a[i] = 0;
}
for(;;){
int i;
++ans;
for(i = N - 1; a[i] == 2; --i){
if(i == 0){
printf("%d\n", ans);
return 0;
}else{
a[i] = 0;
}
}
++a[i];
}
}
这会计算从 0 到 2 选择 N (= 19 ) 个数字的方式,并打印方式的数量 (= 3^19 = 1,162,261,467)。
我用 gcc 编译了这段代码。 -O1 比 -O2 快。为什么 -O2 优化比 -O1 差?
  • CPU:Intel(R) Core(TM) i7-8565U,x86_64
  • 操作系统:Arch Linux (5.9.1-arch1-1)
  • 编译器:gcc (GCC) 10.2.0

  • 编辑:
    使用 -S 选项运行 gcc 生成以下汇编代码:
    -O1
        .file   "a.c"
    .text
    .section .rodata.str1.1,"aMS",@progbits,1
    .LC0:
    .string "%d\n"
    .text
    .globl main
    .type main, @function
    main:
    .LFB11:
    .cfi_startproc
    subq $104, %rsp
    .cfi_def_cfa_offset 112
    movq %fs:40, %rax
    movq %rax, 88(%rsp)
    xorl %eax, %eax
    movq %rsp, %rax
    leaq 76(%rsp), %rdx
    .L2:
    movl $0, (%rax)
    addq $4, %rax
    cmpq %rdx, %rax
    jne .L2
    movl $0, %esi
    jmp .L7
    .L4:
    movslq %edx, %rdx
    addl $1, %ecx
    movl %ecx, (%rsp,%rdx,4)
    .L7:
    addl $1, %esi
    movl 72(%rsp), %ecx
    leaq 68(%rsp), %rax
    movl $18, %edx
    cmpl $2, %ecx
    jne .L4
    .L5:
    movl $0, 4(%rax)
    subl $1, %edx
    movl (%rax), %ecx
    cmpl $2, %ecx
    jne .L4
    subq $4, %rax
    testl %edx, %edx
    jne .L5
    leaq .LC0(%rip), %rdi
    movl $0, %eax
    call printf@PLT
    movq 88(%rsp), %rax
    subq %fs:40, %rax
    jne .L14
    movl $0, %eax
    addq $104, %rsp
    .cfi_remember_state
    .cfi_def_cfa_offset 8
    ret
    .L14:
    .cfi_restore_state
    call __stack_chk_fail@PLT
    .cfi_endproc
    .LFE11:
    .size main, .-main
    .ident "GCC: (GNU) 10.2.0"
    .section .note.GNU-stack,"",@progbits
    -O2
        .file   "a.c"
    .text
    .section .rodata.str1.1,"aMS",@progbits,1
    .LC0:
    .string "%d\n"
    .section .text.startup,"ax",@progbits
    .p2align 4
    .globl main
    .type main, @function
    main:
    .LFB11:
    .cfi_startproc
    subq $104, %rsp
    .cfi_def_cfa_offset 112
    movl $9, %ecx
    xorl %esi, %esi
    movq %fs:40, %rax
    movq %rax, 88(%rsp)
    xorl %eax, %eax
    movq %rsp, %rdx
    movq %rdx, %rdi
    rep stosq
    movl $0, (%rdi)
    leaq 68(%rsp), %rdi
    .L6:
    movl 72(%rsp), %ecx
    addl $1, %esi
    movq %rdi, %rax
    movl $18, %edx
    cmpl $2, %ecx
    je .L4
    jmp .L3
    .p2align 4,,10
    .p2align 3
    .L5:
    subq $4, %rax
    testl %edx, %edx
    je .L14
    .L4:
    movl (%rax), %ecx
    movl $0, 4(%rax)
    subl $1, %edx
    cmpl $2, %ecx
    je .L5
    .L3:
    movslq %edx, %rdx
    addl $1, %ecx
    movl %ecx, (%rsp,%rdx,4)
    jmp .L6
    .p2align 4,,10
    .p2align 3
    .L14:
    xorl %eax, %eax
    leaq .LC0(%rip), %rdi
    call printf@PLT
    movq 88(%rsp), %rax
    subq %fs:40, %rax
    jne .L15
    xorl %eax, %eax
    addq $104, %rsp
    .cfi_remember_state
    .cfi_def_cfa_offset 8
    ret
    .L15:
    .cfi_restore_state
    call __stack_chk_fail@PLT
    .cfi_endproc
    .LFE11:
    .size main, .-main
    .ident "GCC: (GNU) 10.2.0"
    .section .note.GNU-stack,"",@progbits
    基准是:
    $ gcc a.c -O1
    $ time ./a.out
    1162261467

    real 0m0.895s
    user 0m0.894s
    sys 0m0.000s
    $ time ./a.out
    1162261467

    real 0m0.912s
    user 0m0.911s
    sys 0m0.000s
    $ time ./a.out
    1162261467

    real 0m0.925s
    user 0m0.924s
    sys 0m0.001s
    $ gcc a.c -O2
    $ time ./a.out
    1162261467

    real 0m1.570s
    user 0m1.568s
    sys 0m0.000s
    $ time ./a.out
    1162261467

    real 0m1.567s
    user 0m1.562s
    sys 0m0.004s
    $ time ./a.out
    1162261467

    real 0m1.576s
    user 0m1.568s
    sys 0m0.001s
    $ gcc a.c -O3
    $ time ./a.out
    1162261467

    real 0m1.613s
    user 0m1.612s
    sys 0m0.000s
    $ time ./a.out
    1162261467

    real 0m1.608s
    user 0m1.599s
    sys 0m0.003s
    $ time ./a.out
    1162261467

    real 0m1.628s
    user 0m1.628s
    sys 0m0.000s
    $ gcc a.c -Ofast
    $ time ./a.out
    1162261467

    real 0m1.571s
    user 0m1.570s
    sys 0m0.001s
    $ time ./a.out
    1162261467

    real 0m1.604s
    user 0m1.595s
    sys 0m0.004s
    $ time ./a.out
    1162261467

    real 0m1.616s
    user 0m1.613s
    sys 0m0.000s
    $ gcc a.c -O0
    $ time ./a.out
    1162261467

    real 0m2.457s
    user 0m2.456s
    sys 0m0.001s
    $ time ./a.out
    1162261467

    real 0m2.526s
    user 0m2.525s
    sys 0m0.000s
    $ time ./a.out
    1162261467

    real 0m2.565s
    user 0m2.565s
    sys 0m0.000s

    编辑:
    我编辑了这样的代码:
    #include <stdio.h>
    #define N 19

    volatile int answer;

    int main(void){
    int a[N];
    int ans = 0;

    for(int i = 0; i < N; ++i){
    a[i] = 0;
    }
    for(;;){
    int i;
    ++ans;
    for(i = N - 1; a[i] == 2; --i){
    if(i == 0){
    answer = ans;
    return 0;
    }else{
    a[i] = 0;
    }
    }
    ++a[i];
    }
    }
    并再次测量:
    $ gcc a.c -O1
    $ time ./a.out

    real 0m0.924s
    user 0m0.924s
    sys 0m0.000s
    $ time ./a.out

    real 0m0.950s
    user 0m0.949s
    sys 0m0.000s
    $ time ./a.out

    real 0m0.993s
    user 0m0.989s
    sys 0m0.004s
    $ gcc a.c -O2
    $ time ./a.out

    real 0m1.637s
    user 0m1.636s
    sys 0m0.000s
    $ time ./a.out

    real 0m1.661s
    user 0m1.656s
    sys 0m0.004s
    $ time ./a.out

    real 0m1.656s
    user 0m1.654s
    sys 0m0.001s
    编辑:
    我在 [[likely]] 之后添加了 for(;;) 属性:
    #include <stdio.h>
    #define N 19

    int main(void){
    int a[N];
    int ans = 0;

    for(int i = 0; i < N; ++i){
    a[i] = 0;
    }
    for(;;) [[likely]] {
    int i;
    ++ans;
    for(i = N - 1; a[i] == 2; --i){
    if(i == 0){
    printf("%d\n", ans);
    return 0;
    }else{
    a[i] = 0;
    }
    }
    ++a[i];
    }
    }
    然后,基准测试的结果发生了变化:
    $ g++ a.cpp -O1
    $ for i in {1..5}; do time ./a.out; done
    1162261467
    ./a.out 0.65s user 0.00s system 99% cpu 0.653 total
    1162261467
    ./a.out 0.65s user 0.00s system 99% cpu 0.657 total
    1162261467
    ./a.out 0.66s user 0.00s system 99% cpu 0.656 total
    1162261467
    ./a.out 0.66s user 0.00s system 99% cpu 0.665 total
    1162261467
    ./a.out 0.66s user 0.00s system 99% cpu 0.660 total
    $ g++ a.cpp -O2
    $ for i in {1..5}; do time ./a.out; done
    1162261467
    ./a.out 0.66s user 0.00s system 99% cpu 0.661 total
    1162261467
    ./a.out 0.65s user 0.00s system 99% cpu 0.648 total
    1162261467
    ./a.out 0.66s user 0.00s system 99% cpu 0.659 total
    1162261467
    ./a.out 0.65s user 0.00s system 99% cpu 0.654 total
    1162261467
    ./a.out 0.66s user 0.00s system 99% cpu 0.657 total
    -O2 和 -O1 一样快!谢谢@Acorn。

    最佳答案

    Why -O2 optimization was worse than -O1?


    在大多数情况下,更高的优化级别应该会给你更好的性能。不过,您可能会发现像这样的异常(exception)。特别是在像这样的微基准测试中。
    您的程序使用的代码和数据内存非常小,缓存和内存访问不太可能成为问题。但是,它是分支密集型的,这意味着它将归结为静态和动态分支预测。
    如果您的编译器出错,就像在这种情况下一样,您可以尝试通过可能/不太可能的提示或分析程序为其提供更多信息。

    关于c - 为什么 -O1 比 -O2 快,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/64909830/

    26 4 0
    Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
    广告合作:1813099741@qq.com 6ren.com