gpt4 book ai didi

arm - Raspberry Pi 1 和 2 中 ARM11 和 Cortex-A7 内核的每个周期峰值 FLOPs

转载 作者:行者123 更新时间:2023-12-04 23:18:08 32 4
gpt4 key购买 nike

我想知道 Raspberry Pi 1 中的 ARM1176JZF-S 内核和 Raspberry Pi 2 中的 Cortex-A7 内核的每个周期的峰值 FLOP。

来自 ARM1176JZF-S Technical Reference Manual似乎 VFPv2 每个时钟周期可以做一个 SP MAC,每隔一个时钟周期做一个 DP MAC。此外,还有三个可以并行运行的管道:MAC 管道 (FMAC)、除法和 sqrt 管道 (DS) 和加载/存储管道 (LS)。基于此,Raspberry PI 1 的 ARM1176JZF-S 似乎至少可以做到(来自 FMAC 管道)

  • 1 DP FLOP/周期:1 MAC/2 周期
  • 2 SP FLOPs/cycle:1 MAC/cycle

  • Wikipedia声称树莓派 PI 1 的 FLOPS 是 0.041 DP GFLOPS .除以 0.700 GHz 得到小于 0.06 DP FLOPs/cycle。这比我估计的 1 DP FLOP/周期少了大约 17 倍。

    那么正确答案是什么呢?

    对于 Raspberry Pi 2 中的 Cortex-A7 处理器,我相信它与 Cortex-A9 相同。 The FLOPs/cycle/core for the Cortex-A9是:
  • 1.5 DP FLOPs/cycle:每个周期一次标量加法 + 每隔一个周期一次标量乘法(1 + 0.5 = 1.5)
  • 4 个 SP FLOPs/周期:每隔一个周期进行 4 次宽 NEON 加法 + 每隔一个周期进行 4 次宽 NEON 乘法。

  • Raspberry Pi 2 的 FLOPs/cycle/core 是否与 Cortex-A9 相同?如果不是,正确答案是什么?

    编辑:

    The main differences between the Cortex-A9 and Cortex-A7 (就峰值 FLOPs/周期而言)是:
  • Cortex-A9 是双发射的(每个时钟周期两条指令),而 Cortex-A7 只是部分双发射(only partially dual-issue):“A7 不能双发射浮点或 NEON 指令。”
  • Cortex-A9 是乱序 (OoO) 处理器,而 Cortex-A7 不是。

  • 我不确定为什么 OoO 会影响峰值 FLOPS,但双发射(dual issue)当然会有影响。我认为这会使峰值 FLOPS 减半。

    编辑:根据 Stephen Canon 在评论中给出的表格 http://hardwarebug.org/2014/05/15/cortex-a7-instruction-cycle-timings/ ,以下是我对 Cortex-A7 峰值 FLOPs 的新估计:
  • 0.5 DP FLOPs/cycle:每四个周期一个 VMLA.F64 (VFP)。
  • 1.0 DP FLOPs/cycle:每个周期一个 VADD.F64 (VFP)。
  • 2.0 SP FLOPs/cycle:每个周期一个 VMLA.F32 (VFP)。
  • 2.0 SP FLOPs/cycle:每隔一个周期在两个 32 位浮点数上有一个 VMLA.F32 (NEON)。
  • 最佳答案

    示例 1:编译后的程序 MP-MFLOPSPiNeon 在 900 MHz 的 RPi 2 上达到 >647 MFLOPS(数据字数 3.2k 到 3.2M)。反汇编结果看起来与不使用线程时相同。所用的编译/链接命令以及每个数据字执行 32 次运算的 C 代码如下[也许有人能给出更快的编译选项]。

       MP-MFLOPS Compiled NEON v1.0

    gcc mpmflops.c cpuidc.c -lrt -lc -lm -O3 -mcpu=cortex-a7
    -mfloat-abi=hard -mfpu=neon-vfpv4 -funsafe-math-optimizations -lpthread -o MP-MFLOPSPiNeon

    32 OPs/Word 1 CPU 692 MFLOPS

    /*
     * Benchmark kernel: for every one of the n elements of x, replace x[i]
     * with an alternating sum/difference of eleven (x[i] + const) * const
     * terms built from the 22 scalar arguments a..y.
     *
     * Per element this is 11 additions (x[i] + const), 11 multiplications and
     * 10 additions/subtractions to combine the terms: 32 floating-point
     * operations in total.
     *
     * n  number of elements to process; the loop body never runs when n <= 0.
     * x  array updated in place.
     */
    void triadplus2(int n, float a, float b, float c, float d,
    float e, float f, float g, float h, float j,
    float k, float l, float m, float o, float p,
    float q, float r, float s, float t, float u,
    float v, float w, float y, float *x)
    {
    int i;
    for(i=0; i<n; i++)
    x[i] = (x[i]+a)*b-(x[i]+c)*d+(x[i]+e)*f-(x[i]+g)*h+(x[i]+j)*k
    -(x[i]+l)*m+(x[i]+o)*p-(x[i]+q)*r+(x[i]+s)*t-(x[i]+u)*v+(x[i]+w)*y;
    }

    下面是较为复杂的反汇编。注意其中标出的融合乘加(vfma)和融合乘减(vfms)指令,以及过多的加载指令。
     @ triadplus2 -- compiler output for the C function triadplus2().
     @ Register use as it appears from this listing and the C signature
     @ (assumes the AAPCS hard-float argument assignment -- TODO confirm):
     @   r0 = n, r1 = x
     @   s0..s15 = a, b, c, d, e, f, g, h, j, k, l, m, o, p, q, r
     @   s, t, u, v, w, y are passed on the stack and loaded below.
     triadplus2:

    @ args = 24, pretend = 0, frame = 272
    @ frame_needed = 0, uses_anonymous_args = 0
    @ link register save eliminated.
    @ Save the callee-saved registers this function overwrites and reserve
    @ the 272-byte frame.  The 'cmp r0, #0' result is consumed by 'ble .L57'
    @ further down; none of the instructions in between updates the flags.
    stmfd sp!, {r4, r5, r6, r7}
    cmp r0, #0
    fstmfdd sp!, {d8, d9, d10, d11, d12, d13, d14, d15}
    sub sp, sp, #272
    @ 16 bytes (r4-r7) + 64 bytes (d8-d15) + 272 bytes of frame = 352, so the
    @ six stack-passed floats sit at [sp, #352] .. [sp, #372].
    @ In argument order: s21 = s, s18 = t, s19 = u, s16 = v, s20 = w, s17 = y.
    flds s21, [sp, #352]
    flds s18, [sp, #356]
    flds s19, [sp, #360]
    flds s16, [sp, #364]
    flds s20, [sp, #368]
    flds s17, [sp, #372]
    @ n <= 0: nothing to do, go straight to the epilogue.
    ble .L57
    @ r3 = 3 when bit 2 of the address in r1 is set, otherwise 0: the number
    @ of leading elements to handle with scalar code before the 4-wide loop
    @ (presumably to reach the 64-bit alignment hinted at by the ':64'
    @ qualifiers on the vector loads/stores -- TODO confirm).
    sbfx r3, r1, #2, #1
    and r3, r3, #3
    @ Clamp that count to n (unsigned compare).
    cmp r3, r0
    movcs r3, r0
    @ When n <= 4 every element is handled by the scalar code (r3 = n) and
    @ execution falls through; for n > 4 continue at .L80.
    cmp r0, #4
    movls r3, r0
    bhi .L80
    LOOP HERE
    @ .L59: scalar handling of the first r3 elements of x (r3 >= 1 here),
    @ unrolled four times.  Each copy loads one float into s23, forms the
    @ eleven (x + const) values, multiplies the first one and accumulates the
    @ other ten products into s22 with fused multiply-add (vfma) or fused
    @ multiply-subtract (vfms), then stores s22 back to the same address.
    @ The 'cmp r3, #k' near the top of each copy drives both the conditional
    @ 'movls r2, #k' (r2 = number of elements done) and the 'bls .L61' after
    @ the store; the VFP arithmetic in between leaves the flags untouched.
    .L59:
    @ Element 0.
    flds s23, [r1]
    cmp r3, #1
    fadds s22, s23, s4
    movls r2, #1
    fadds s24, s23, s0
    fadds s31, s23, s8
    fadds s30, s23, s12
    fmuls s22, s22, s5
    fadds s29, s23, s21
    fadds s28, s23, s20
    fadds s27, s23, s6
    vfma.f32 s22, s24, s1
    fadds s26, s23, s2
    fadds s25, s23, s10
    fadds s24, s23, s14
    fadds s23, s23, s19
    vfma.f32 s22, s31, s9
    vfma.f32 s22, s30, s13
    vfma.f32 s22, s29, s18
    vfma.f32 s22, s28, s17
    vfms.f32 s22, s27, s7
    vfms.f32 s22, s26, s3
    vfms.f32 s22, s25, s11
    vfms.f32 s22, s24, s15
    vfms.f32 s22, s23, s16
    fsts s22, [r1]
    @ Done if only one element was requested (r3 <= 1).
    bls .L61
    @ Element 1.
    flds s23, [r1, #4]
    cmp r3, #2
    fadds s22, s23, s4
    movls r2, #2
    fadds s24, s23, s0
    fadds s31, s23, s8
    fadds s30, s23, s12
    fmuls s22, s22, s5
    fadds s29, s23, s21
    fadds s28, s23, s20
    fadds s27, s23, s6
    vfma.f32 s22, s24, s1
    fadds s26, s23, s2
    fadds s25, s23, s10
    fadds s24, s23, s14
    fadds s23, s23, s19
    vfma.f32 s22, s31, s9
    vfma.f32 s22, s30, s13
    vfma.f32 s22, s29, s18
    vfma.f32 s22, s28, s17
    vfms.f32 s22, s27, s7
    vfms.f32 s22, s26, s3
    vfms.f32 s22, s25, s11
    vfms.f32 s22, s24, s15
    vfms.f32 s22, s23, s16
    fsts s22, [r1, #4]
    @ Done if r3 <= 2.
    bls .L61
    @ Element 2.
    flds s23, [r1, #8]
    cmp r3, #3
    fadds s22, s23, s4
    movls r2, #3
    fadds s24, s23, s0
    fadds s31, s23, s8
    fadds s30, s23, s12
    fmuls s22, s22, s5
    fadds s29, s23, s21
    fadds s28, s23, s20
    fadds s27, s23, s6
    vfma.f32 s22, s24, s1
    fadds s26, s23, s2
    fadds s25, s23, s10
    fadds s24, s23, s14
    fadds s23, s23, s19
    vfma.f32 s22, s31, s9
    vfma.f32 s22, s30, s13
    vfma.f32 s22, s29, s18
    vfma.f32 s22, s28, s17
    vfms.f32 s22, s27, s7
    vfms.f32 s22, s26, s3
    vfms.f32 s22, s25, s11
    vfms.f32 s22, s24, s15
    vfms.f32 s22, s23, s16
    fsts s22, [r1, #8]
    @ Done if r3 <= 3.
    bls .L61
    @ Element 3: same eleven register pairs as the copies above, but the
    @ products are accumulated in a different order.  r2 is set to 4
    @ unconditionally and execution falls through to .L61.
    flds s23, [r1, #12]
    mov r2, #4
    fadds s22, s23, s20
    fadds s24, s23, s21
    fadds s31, s23, s12
    fadds s30, s23, s8
    fmuls s22, s22, s17
    fadds s29, s23, s4
    fadds s28, s23, s0
    fadds s27, s23, s6
    vfma.f32 s22, s24, s18
    fadds s26, s23, s2
    fadds s25, s23, s10
    fadds s24, s23, s14
    fadds s23, s23, s19
    vfma.f32 s22, s31, s13
    vfma.f32 s22, s30, s9
    vfma.f32 s22, s29, s5
    vfma.f32 s22, s28, s1
    vfms.f32 s22, s27, s7
    vfms.f32 s22, s26, s3
    vfms.f32 s22, s25, s11
    vfms.f32 s22, s24, s15
    vfms.f32 s22, s23, s16
    fsts s22, [r1, #12]
    @ .L61: reached after the leading scalar elements.  Return if they were
    @ all of them (r3 == r0); otherwise split what is left between the
    @ 4-wide loop and the scalar tail.
    .L61:
    cmp r3, r0
    beq .L57
    @ r6 = r0 - r3 (elements still to do), r4 = r6 / 4 (vector iterations),
    @ r7 = 4 * r4 (elements the vector loop will cover).  'movs' sets the
    @ flags, so a zero r7 skips the vector code and goes to the tail at .L63.
    rsb r6, r3, r0
    mov r4, r6, lsr #2
    movs r7, r4, asl #2
    beq .L63
    @ .L81: set-up for the 4-wide loop at .L69.  Every scalar constant in
    @ s0..s21 (addressed here as lanes of d0..d10) is replicated into all four
    @ lanes of a q register with vdup.32.  Five of the replicated vectors stay
    @ in registers; the other seventeen are stored into the stack frame as
    @ pairs of d registers and are read back inside the loop.
    @   kept in registers: q7 <- s13, q6 <- s14, q15 <- s15, q14 <- s21,
    @                      q13 <- s18
    @   stack slots:  [sp]      <- s17   [sp, #16]  <- s0    [sp, #32]  <- s1
    @                 [sp, #48]  <- s2   [sp, #64]  <- s3    [sp, #80]  <- s4
    @                 [sp, #96]  <- s5   [sp, #112] <- s6    [sp, #128] <- s7
    @                 [sp, #144] <- s8   [sp, #160] <- s9    [sp, #176] <- s10
    @                 [sp, #192] <- s11  [sp, #208] <- s12   [sp, #224] <- s19
    @                 [sp, #240] <- s16  [sp, #256] <- s20
    .L81:
    vdup.32 q12, d1[1]
    vdup.32 q8, d0[0]
    vdup.32 q10, d0[1]
    vdup.32 q11, d1[0]
    vstr d24, [sp, #64]
    vstr d25, [sp, #72]
    vdup.32 q12, d3[1]
    vstr d16, [sp, #16]
    vstr d17, [sp, #24]
    vstr d20, [sp, #32]
    vstr d21, [sp, #40]
    vdup.32 q8, d2[0]
    vdup.32 q10, d2[1]
    vstr d22, [sp, #48]
    vstr d23, [sp, #56]
    vstr d24, [sp, #128]
    vstr d25, [sp, #136]
    vdup.32 q11, d3[0]
    vdup.32 q12, d5[1]
    vstr d16, [sp, #80]
    vstr d17, [sp, #88]
    vstr d20, [sp, #96]
    vstr d21, [sp, #104]
    vdup.32 q8, d4[0]
    vdup.32 q10, d4[1]
    vstr d22, [sp, #112]
    vstr d23, [sp, #120]
    vstr d24, [sp, #192]
    vstr d25, [sp, #200]
    vdup.32 q11, d5[0]
    vdup.32 q12, d10[0]
    vstr d16, [sp, #144]
    vstr d17, [sp, #152]
    vstr d20, [sp, #160]
    vstr d21, [sp, #168]
    vstr d22, [sp, #176]
    vstr d23, [sp, #184]
    vdup.32 q8, d6[0]
    vdup.32 q10, d9[1]
    vdup.32 q11, d8[0]
    vstr d24, [sp, #256]
    vstr d25, [sp, #264]
    vdup.32 q12, d8[1]
    vstr d16, [sp, #208]
    vstr d17, [sp, #216]
    vdup.32 q7, d6[1]
    vdup.32 q6, d7[0]
    vdup.32 q15, d7[1]
    vdup.32 q14, d10[1]
    vdup.32 q13, d9[0]
    vstr d20, [sp, #224]
    vstr d21, [sp, #232]
    vstr d22, [sp, #240]
    vstr d23, [sp, #248]
    vst1.64 {d24-d25}, [sp:64]
    @ r3 = r1 + 4 * r3: address of the first element the vector loop reads
    @ (r3 held the count of elements already handled by scalar code).
    @ ip = vector iteration counter, r5 = store pointer, starting equal to r3.
    add r3, r1, r3, asl #2
    mov ip, #0
    mov r5, r3
    @ .L69: 4-wide loop.  Each pass loads four floats from [r3] into q9
    @ (post-incrementing r3), builds the eleven (x + const) vectors with
    @ vadd.f32, starts the accumulator q8 with one vmul.f32 and folds in the
    @ other ten products with vfma.f32 / vfms.f32, then stores q8 through r5
    @ (post-incrementing r5).
    @ Only q6, q7, q13, q14 and q15 are used as constants straight from
    @ registers; the other seventeen constant vectors are re-read from the
    @ stack frame on every pass (the vldr pairs and the vld1.64 from [sp]).
    @ Loop control: ip is incremented and compared with r4 in the middle of
    @ the body; the NEON instructions after that 'cmp' do not change the
    @ flags, so 'bhi .L69' repeats while r4 > ip (unsigned).
    .L69:

    vfma FUSED MULTIPLY ACCUMULATE or vfms SUBTRACT QUAD WORDS

    vld1.64 {d18-d19}, [r3:64]!
    vldr d20, [sp, #80]
    vldr d21, [sp, #88]
    vldr d22, [sp, #16]
    vldr d23, [sp, #24]
    vadd.f32 q8, q9, q10
    vldr d24, [sp, #96]
    vldr d25, [sp, #104]
    vadd.f32 q10, q9, q11
    vmul.f32 q8, q8, q12
    vldr d22, [sp, #32]
    vldr d23, [sp, #40]
    vldr d24, [sp, #144]
    vldr d25, [sp, #152]
    vfma.f32 q8, q10, q11
    add ip, ip, #1
    vadd.f32 q11, q9, q12
    vldr d24, [sp, #208]
    vldr d25, [sp, #216]
    cmp r4, ip
    vadd.f32 q10, q9, q12
    vldr d24, [sp, #160]
    vldr d25, [sp, #168]
    vfma.f32 q8, q11, q12
    vadd.f32 q11, q9, q14
    vldr d24, [sp, #256]
    vldr d25, [sp, #264]
    vfma.f32 q8, q10, q7
    vadd.f32 q10, q9, q12
    vldr d24, [sp, #112]
    vldr d25, [sp, #120]
    vfma.f32 q8, q11, q13
    vadd.f32 q11, q9, q12
    vld1.64 {d24-d25}, [sp:64]
    vfma.f32 q8, q10, q12
    vldr d24, [sp, #48]
    vldr d25, [sp, #56]
    vadd.f32 q10, q9, q12
    vldr d24, [sp, #128]
    vldr d25, [sp, #136]
    vfms.f32 q8, q11, q12
    vldr d24, [sp, #176]
    vldr d25, [sp, #184]
    vadd.f32 q11, q9, q12
    vldr d24, [sp, #64]
    vldr d25, [sp, #72]
    vfms.f32 q8, q10, q12
    vldr d24, [sp, #224]
    vldr d25, [sp, #232]
    vadd.f32 q10, q9, q6
    vadd.f32 q9, q9, q12
    vldr d24, [sp, #192]
    vldr d25, [sp, #200]
    vfms.f32 q8, q11, q12
    vfms.f32 q8, q10, q15
    vldr d20, [sp, #240]
    vldr d21, [sp, #248]
    vfms.f32 q8, q9, q10
    vst1.64 {d16-d17}, [r5:64]!
    bhi .L69

    END vfma FUSED MULTIPLY ACCUMULATE or vfms SUBTRACT QUAD WORDS

    @ After the loop: advance the element index r2 by r7 and return when
    @ r7 == r6; otherwise fall through to the scalar tail at .L63.
    @ 'add' does not set flags, so 'beq' still tests the 'cmp r7, r6' result.
    cmp r7, r6
    add r2, r2, r7
    beq .L57
    @ .L63: scalar tail.  Handles up to three elements starting at index r2,
    @ one unrolled copy per element, using the same add / multiply /
    @ vfma / vfms sequence on single floats.  After each of the first two
    @ copies 'ble .L57' returns when r0 is not greater than the index of the
    @ next element (signed compare made before the arithmetic; the VFP
    @ instructions in between leave the flags alone).
    .L63:
    @ First tail element at r1 + 4 * r2; r3 = r2 + 1 is the next index.
    add ip, r1, r2, asl #2
    add r3, r2, #1
    cmp r0, r3
    flds s23, [ip]
    fadds s22, s23, s4
    fadds s24, s23, s0
    fadds s31, s23, s8
    fadds s30, s23, s12
    fmuls s22, s22, s5
    fadds s29, s23, s21
    fadds s28, s23, s20
    fadds s27, s23, s2
    vfma.f32 s22, s24, s1
    fadds s26, s23, s6
    fadds s25, s23, s14
    fadds s24, s23, s10
    fadds s23, s23, s19
    vfma.f32 s22, s31, s9
    vfma.f32 s22, s30, s13
    vfma.f32 s22, s29, s18
    vfma.f32 s22, s28, s17
    vfms.f32 s22, s27, s3
    vfms.f32 s22, s26, s7
    vfms.f32 s22, s25, s15
    vfms.f32 s22, s24, s11
    vfms.f32 s22, s23, s16
    fsts s22, [ip]
    ble .L57
    @ Second tail element at r1 + 4 * r3; r2 is advanced by 2 so it indexes
    @ the element after this one.
    add r3, r1, r3, asl #2
    add r2, r2, #2
    cmp r0, r2
    flds s23, [r3]
    fadds s22, s23, s4
    fadds s24, s23, s0
    fadds s31, s23, s8
    fadds s30, s23, s12
    fmuls s22, s22, s5
    fadds s29, s23, s21
    fadds s28, s23, s20
    fadds s27, s23, s6
    vfma.f32 s22, s24, s1
    fadds s26, s23, s2
    fadds s25, s23, s10
    fadds s24, s23, s14
    fadds s23, s23, s19
    vfma.f32 s22, s31, s9
    vfma.f32 s22, s30, s13
    vfma.f32 s22, s29, s18
    vfma.f32 s22, s28, s17
    vfms.f32 s22, s27, s7
    vfms.f32 s22, s26, s3
    vfms.f32 s22, s25, s11
    vfms.f32 s22, s24, s15
    vfms.f32 s22, s23, s16
    fsts s22, [r3]
    ble .L57
    @ Third tail element at r1 + 4 * r2.  No branch follows, so this is the
    @ last work before the epilogue; this copy writes its temporaries over
    @ the constant registers themselves (s0, s2, s4, s5, ...) instead of
    @ using s22..s31.
    add r2, r1, r2, asl #2
    flds s22, [r2]
    fadds s4, s22, s4
    fadds s0, s22, s0
    fadds s8, s22, s8
    fadds s12, s22, s12
    fmuls s5, s4, s5
    fadds s21, s22, s21
    fadds s20, s22, s20
    fadds s6, s22, s6
    vfma.f32 s5, s0, s1
    fadds s2, s22, s2
    fadds s10, s22, s10
    fadds s14, s22, s14
    fadds s19, s22, s19
    vfma.f32 s5, s8, s9
    vfma.f32 s5, s12, s13
    vfma.f32 s5, s21, s18
    vfma.f32 s5, s20, s17
    vfms.f32 s5, s6, s7
    vfms.f32 s5, s2, s3
    vfms.f32 s5, s10, s11
    vfms.f32 s5, s14, s15
    vfms.f32 s5, s19, s16
    fsts s5, [r2]
    @ .L57: common exit.  Release the 272-byte frame, restore d8-d15 and
    @ r4-r7 in the reverse order of the saves at function entry, and return
    @ through lr.
    .L57:
    add sp, sp, #272
    @ sp needed
    fldmfdd sp!, {d8-d15}
    ldmfd sp!, {r4, r5, r6, r7}
    bx lr
    @ .L80: entered from the function entry code when r0 > 4.
    @ If r3 (count of leading scalar elements) is non-zero, run them at .L59.
    @ If it is zero, set the element index r2 to 0 and compute the split
    @ directly: r6 = r0 - r3, r4 = r6 / 4, r7 = 4 * r4.  A non-zero r7 goes
    @ to the vector set-up at .L81, otherwise to the scalar tail at .L63.
    .L80:
    cmp r3, #0
    moveq r2, r3
    bne .L59

    rsb r6, r3, r0
    mov r4, r6, lsr #2
    movs r7, r4, asl #2
    bne .L81
    b .L63
    .size triadplus2, .-triadplus2

    示例 2 - 使用 NEON 内在函数(在我知道融合指令之前)> 700 MFLOPS。第一个C代码:
     32 Operations per word
    C NEON Intrinsics
    n = words 3.2k, 32k, 3.2M
    similar results > 700 MFLOPS.

    /*
     * NEON-intrinsics version of the 32-operations-per-word kernel.
     * Each pass handles four floats: i advances by 4 and ptrx1 by 4.
     * x41 is loaded from ptrx1, z41 accumulates eleven (x41 + c) * c terms
     * that are alternately subtracted and added, and the result is stored
     * back through ptrx1.  Every term is a separate add, multiply and
     * add/subtract intrinsic; no fused multiply-add intrinsic is used.
     * a41 .. y41, x41, z41, z42 and ptrx1 are declared outside this fragment
     * (presumably a41 .. y41 each hold one constant replicated in all four
     * lanes -- TODO confirm).
     */
    for(i=0; i<n; i=i+4)
    {
    x41 = vld1q_f32(ptrx1);

    /* term 1: starts the accumulator */
    z41 = vaddq_f32(x41, a41);
    z41 = vmulq_f32(z41, b41);

    /* term 2: subtracted */
    z42 = vaddq_f32(x41, c41);
    z42 = vmulq_f32(z42, d41);
    z41 = vsubq_f32(z41, z42);

    /* term 3: added */
    z42 = vaddq_f32(x41, e41);
    z42 = vmulq_f32(z42, f41);
    z41 = vaddq_f32(z41, z42);

    /* term 4: subtracted */
    z42 = vaddq_f32(x41, g41);
    z42 = vmulq_f32(z42, h41);
    z41 = vsubq_f32(z41, z42);

    /* term 5: added */
    z42 = vaddq_f32(x41, j41);
    z42 = vmulq_f32(z42, k41);
    z41 = vaddq_f32(z41, z42);

    /* term 6: subtracted */
    z42 = vaddq_f32(x41, l41);
    z42 = vmulq_f32(z42, m41);
    z41 = vsubq_f32(z41, z42);

    /* term 7: added */
    z42 = vaddq_f32(x41, o41);
    z42 = vmulq_f32(z42, p41);
    z41 = vaddq_f32(z41, z42);

    /* term 8: subtracted */
    z42 = vaddq_f32(x41, q41);
    z42 = vmulq_f32(z42, r41);
    z41 = vsubq_f32(z41, z42);

    /* term 9: added */
    z42 = vaddq_f32(x41, s41);
    z42 = vmulq_f32(z42, t41);
    z41 = vaddq_f32(z41, z42);

    /* term 10: subtracted */
    z42 = vaddq_f32(x41, u41);
    z42 = vmulq_f32(z42, v41);
    z41 = vsubq_f32(z41, z42);

    /* term 11: added */
    z42 = vaddq_f32(x41, w41);
    z42 = vmulq_f32(z42, y41);
    z41 = vaddq_f32(z41, z42);

    /* write the four results back over the inputs */
    vst1q_f32(ptrx1, z41);

    ptrx1 = ptrx1 + 4;
    }

    接下来是反汇编,同样有大量的加载指令。
    Assembly Code

    @ .L26: loop body shown for the intrinsics version.  Each pass loads four
    @ floats from [ip] into q8, evaluates eleven (q8 + const) * const terms
    @ with separate vadd.f32 / vmul.f32 instructions and combines them into
    @ the accumulator with alternating vsub.f32 / vadd.f32 -- there is no
    @ vfma/vfms here.  The result in q10 is stored back to [ip] with
    @ post-increment.
    @ Eleven constant vectors are used straight from registers (q0-q7, q13,
    @ q14, q15); the other eleven are re-read from the stack on every pass
    @ (ten vldr pairs plus the vld1.64 from [sp]).
    @ Loop control: r3 += 4, compared against r0; the NEON instructions after
    @ the 'cmp' do not change the flags, so 'bgt .L26' repeats while r0 > r3
    @ (signed).
    .L26:
    vld1.32 {d16-d17}, [ip]
    vld1.64 {d20-d21}, [sp:64]
    vadd.f32 q9, q8, q14
    vadd.f32 q11, q8, q10
    vldr d24, [sp, #16]
    vldr d25, [sp, #24]
    vmul.f32 q11, q11, q13
    vmul.f32 q9, q9, q12
    vldr d24, [sp, #32]
    vldr d25, [sp, #40]
    vsub.f32 q11, q11, q9
    vadd.f32 q10, q8, q12
    vldr d18, [sp, #48]
    vldr d19, [sp, #56]
    vldr d24, [sp, #64]
    vldr d25, [sp, #72]
    vmul.f32 q10, q10, q9
    vadd.f32 q9, q8, q12
    vadd.f32 q11, q11, q10
    vldr d20, [sp, #80]
    vldr d21, [sp, #88]
    vldr d24, [sp, #96]
    vldr d25, [sp, #104]
    vmul.f32 q9, q9, q10
    vadd.f32 q10, q8, q12
    vsub.f32 q11, q11, q9
    vldr d18, [sp, #112]
    vldr d19, [sp, #120]
    vldr d24, [sp, #128]
    vldr d25, [sp, #136]
    vmul.f32 q10, q10, q9
    vadd.f32 q9, q8, q12
    vadd.f32 q11, q11, q10
    vldr d24, [sp, #160]
    vldr d25, [sp, #168]
    vldr d20, [sp, #144]
    vldr d21, [sp, #152]
    add r3, r3, #4
    cmp r0, r3
    vmul.f32 q9, q9, q10
    vadd.f32 q10, q8, q12
    vsub.f32 q11, q11, q9
    vmul.f32 q10, q10, q15
    vadd.f32 q9, q8, q3
    vadd.f32 q11, q11, q10
    vmul.f32 q9, q9, q2
    vadd.f32 q10, q8, q1
    vsub.f32 q11, q11, q9
    vmul.f32 q10, q10, q0
    vadd.f32 q9, q8, q4
    vadd.f32 q10, q11, q10
    vmul.f32 q9, q9, q5
    vadd.f32 q8, q8, q6
    vsub.f32 q10, q10, q9
    vmul.f32 q8, q8, q7
    vadd.f32 q10, q10, q8
    vst1.32 {d20-d21}, [ip]!
    bgt .L26

    关于arm - Raspberry Pi 1 和 2 中 ARM11 和 Cortex-A7 内核的每个周期峰值 FLOPs,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/30976071/

    32 4 0
    Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
    广告合作:1813099741@qq.com 6ren.com