gpt4 book ai didi

cuda - NVIDIA CUDA 编译器自动展开循环

转载 作者:行者123 更新时间:2023-12-05 01:08:10 25 4
gpt4 key购买 nike

我不知道 NVCC 是否足够智能,可以在这样的循环中自动公开指令级并行 (ILP):

for (int i = 0; i < 8; i++) {
if (somethingHappens) {
someVar = someVar & 1 << i;
}
}

或者我应该重写它以像这样明确地暴露 ILP:
char somevar[8];
for (int i = 0; i < 8; i++) {
if (somethingHappens) {
someVar[i] = 1 << i;
}
}
//reduce somevar using vaddus4 and 3 logical-ands

其他问题:
  • 开普勒的算术管道有多深?
  • 我怎样才能有效地采取措施来了解这种优化是否值得?在块之前和块之后读取时钟寄存器就足够了吗?
  • 最佳答案

    为了回答您的问题,我正在考虑四个不同的内核,其中每个线程执行 for循环 n_loop迭代。这四个内核实现了四种不同的可能情况:

  • 迭代次数n_loop在编译时已知;
  • 迭代次数n_loop在编译时已知并且求和是有条件的;
  • 迭代次数n_loop在运行时已知;
  • 迭代次数n_loop在运行时已知,并执行手动循环展开。

  • 完整代码如下:
    #include <stdio.h>
    #include <time.h>

    #define BLOCKSIZE 512

    #define epsilon 0.5
    #define n_loop 8

    /**********/
    /* iDivUp */
    /**********/
    int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

    /********************/
    /* CUDA ERROR CHECK */
    /********************/
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
    {
    if (code != cudaSuccess)
    {
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
    }
    }

    /****************************************************/
    /* KERNEL #1: NUMBER OF LOOPS KNOWN AT COMPILE-TIME */
    /****************************************************/
    __global__ void testKernel1(float* input, float* output, int N) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {

    float accum = 0.f;

    for (int i = 0; i < n_loop; i++) {
    accum = accum + input[n_loop*tid+i];
    }

    output[tid] = accum;

    }

    }

    /****************************************************/
    /* KERNEL #2: NUMBER OF LOOPS KNOWN AT COMPILE-TIME */
    /****************************************************/
    __global__ void testKernel2(float* input, float* output, int N) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {

    float accum = 0.f;

    for (int i = 0; i < n_loop; i++) if (input[n_loop*tid+i] < epsilon) accum = accum + input[n_loop*tid+i];

    output[tid] = accum;

    }

    }

    /************************************************/
    /* KERNEL #3: NUMBER OF LOOPS KNOWN AT RUN-TIME */
    /************************************************/
    __global__ void testKernel3(float* input, float* output, int N_loop, int N) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {

    float accum = 0.f;

    for (int i = 0; i < N_loop; i++) accum = accum + input[N_loop*tid+i];

    output[tid] = accum;

    }

    }

    /*******************************************************************/
    /* KERNEL #4: NUMBER OF LOOPS KNOWN AT RUN-TIME - LOOP UNROLL OF 4 */
    /*******************************************************************/
    __global__ void testKernel4(float* input, float* output, int N_loop, int N) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {

    float accum1 = 0.f;
    float accum2 = 0.f;
    float accum3 = 0.f;
    float accum4 = 0.f;

    for (int i = 0; i < N_loop/4; i++) {
    accum1 = accum1 + input[N_loop*tid+i];
    accum2 = accum2 + input[N_loop*tid+i+N_loop/4];
    accum3 = accum3 + input[N_loop*tid+i+2*N_loop/4];
    accum4 = accum4 + input[N_loop*tid+i+3*N_loop/4];
    }

    output[tid] = accum1 + accum2 + accum3 + accum4;

    }

    }

    int main() {

    const int N = 512*512*32;

    float* input = (float*) malloc(n_loop*N*sizeof(float));
    float* output = (float*) malloc(N*sizeof(float));
    float* output2 = (float*) malloc(N*sizeof(float));
    float* outputif = (float*) malloc(N*sizeof(float));

    float* d_input; gpuErrchk(cudaMalloc((void**)&d_input, n_loop*N*sizeof(float)));
    float* d_output; gpuErrchk(cudaMalloc((void**)&d_output, N*sizeof(float)));

    srand(time(NULL));
    for (int i=0; i<n_loop*N; i++) input[i] = rand() / (float)RAND_MAX;

    gpuErrchk(cudaMemcpy(d_input, input, n_loop*N*sizeof(float), cudaMemcpyHostToDevice));

    // --- Host-side computations
    for (int k = 0; k < N; k++) {
    float accum1 = 0.f;
    float accum2 = 0.f;
    for (int i = 0; i < n_loop; i++) {
    accum1 = accum1 + input[n_loop*k+i];
    if (input[n_loop*k+i] < epsilon) accum2 = accum2 + input[n_loop*k+i];
    }
    output[k] = accum1;
    outputif[k] = accum2;
    }

    // --- Device-side computation - kernel1
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    testKernel1<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Kernel1 elapsed time: %3.4f ms \n", time);

    gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Check CPU and GPU results
    for (int i=0; i<N; i++)
    if (output[i] != output2[i]) {
    printf("Mismatch at i = %d, Host= %f, Device = %f\n", i, output[i], output2[i]);
    return 1;
    }
    printf("kernel1: results match!\n");

    // --- Device-side computation - kernel2
    cudaEventRecord(start, 0);

    testKernel2<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Kernel1 elapsed time: %3.4f ms \n", time);

    gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Check CPU and GPU results
    for (int i=0; i<N; i++)
    if (outputif[i] != output2[i]) {
    printf("Mismatch at i = %d, Host= %f, Device = %f\n", i, outputif[i], output2[i]);
    return 1;
    }
    printf("kernel2: results match!\n");

    // --- Device-side computation - kernel3
    cudaEventRecord(start, 0);

    testKernel3<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, n_loop, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Kernel3 elapsed time: %3.4f ms \n", time);

    gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Check CPU and GPU results
    for (int i=0; i<N; i++)
    if (output[i] != output2[i]) {
    printf("Mismatch at i = %d, Host= %f, Device = %f\n", i, output[i], output2[i]);
    return 1;
    }
    printf("kernel3: results match!\n");

    // --- Device-side computation - kernel4
    cudaEventRecord(start, 0);

    testKernel4<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, n_loop, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Kernel4 elapsed time: %3.4f ms \n", time);

    gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Check CPU and GPU results
    for (int i=0; i<N; i++)
    if (abs(output[i] - output2[i]) > 0.0001) {
    printf("Mismatch at i = %d, Host= %f, Device = %f, difference = %f\n", i, output[i], output2[i], output2[i] - output[i]);
    return 1;
    }
    printf("kernel4: results match!\n");

    return 0;

    }

    现在让我们分析四种不同情况的反汇编代码(使用 CUDA 6.0 编译)。我正在考虑对 Fermi 架构进行编译。

    内核 1
         MOV R1, c[0x1][0x100];
    S2R R0, SR_CTAID.X;
    IMUL R2, R0, c[0x0][0x8];
    S2R R3, SR_TID.X;
    IADD R0, R2, R3;
    ISETP.GE.AND P0, PT, R0, c[0x0][0x28], PT;
    @P0 BRA.U 0xd8;
    @!P0 IADD R2, R3, R2;
    @!P0 ISCADD R2, R2, c[0x0][0x20], 0x5;
    @!P0 ISCADD R0, R0, c[0x0][0x24], 0x2;
    @!P0 LD R9, [R2];
    @!P0 LD R8, [R2+0x4];
    @!P0 LD R7, [R2+0x8];
    @!P0 LD R6, [R2+0xc];
    @!P0 LD R5, [R2+0x10];
    @!P0 LD R4, [R2+0x14];
    @!P0 LD R3, [R2+0x18];
    @!P0 LD R2, [R2+0x1c];
    @!P0 F2F.F32.F32 R9, R9;
    @!P0 FADD R8, R9, R8;
    @!P0 FADD R7, R8, R7;
    @!P0 FADD R6, R7, R6;
    @!P0 FADD R5, R6, R5;
    @!P0 FADD R4, R5, R4;
    @!P0 FADD R3, R4, R3;
    @!P0 FADD R2, R3, R2;
    @!P0 ST [R0], R2;
    EXIT;

    在这种情况下,编译器完全展开循环。你会看到 8不同的负载 ( LD ) 指令和 7不同的添加 ( FADD ) 指令。

    内核 2
        MOV R1, c[0x1][0x100];
    S2R R0, SR_CTAID.X;
    IMUL R0, R0, c[0x0][0x8];
    S2R R2, SR_TID.X;
    IADD R3, R0, R2;
    ISETP.GE.AND P0, PT, R3, c[0x0][0x28], PT;
    @P0 EXIT;
    IADD R0, R2, R0;
    ISCADD R9, R0, c[0x0][0x20], 0x5;
    LD R0, [R9];
    LD R2, [R9+0x4];
    LD R4, [R9+0x8];
    LD R5, [R9+0xc];
    LD R6, [R9+0x10];
    LD R7, [R9+0x14];
    LD R8, [R9+0x18];
    LD R9, [R9+0x1c];
    FSETP.LT.AND P0, PT, R0, 0.5, PT;
    FSETP.LT.AND P1, PT, R4, 0.5, PT;
    F2F.F32.F32 R0, R0;
    SEL R0, R0, RZ, P0;
    FSETP.LT.AND P0, PT, R2, 0.5, PT;
    @P0 FADD R0, R0, R2;
    FSETP.LT.AND P0, PT, R5, 0.5, PT;
    @P1 FADD R0, R0, R4;
    @P0 FADD R0, R0, R5;
    FSETP.LT.AND P1, PT, R8, 0.5, PT;
    FSETP.LT.AND P0, PT, R6, 0.5, PT;
    FADD R2, R0, R6;
    SEL R2, R2, R0, P0;
    FSETP.LT.AND P0, PT, R7, 0.5, PT;
    ISCADD R0, R3, c[0x0][0x24], 0x2;
    @P0 FADD R2, R2, R7;
    FSETP.LT.AND P0, PT, R9, 0.5, PT;
    @P1 FADD R2, R2, R8;
    @P0 FADD R2, R2, R9;
    ST [R0], R2;
    EXIT;

    同样在这种情况下,编译器正在完全展开循环。你会再次看到 8不同的负载 ( LD ) 指令和 7不同的添加 ( FADD ) 指令。

    内核 3
    c[0x0][0x30]    = N
    c[0x1][0x100] = BLOCKSIZE
    c[0x0][0x8] = blockDim.x
    c[0x0][0x30] = N_loop
    c[0x0][0x20] = input

    /*0000*/ MOV R1, c[0x1][0x100]; R1 = BLOCKSIZE = 256
    /*0008*/ S2R R0, SR_CTAID.X; R0 = blockIdx.x
    /*0010*/ S2R R2, SR_TID.X; R2 = threadIdx.x
    /*0018*/ IMAD R0, R0, c[0x0][0x8], R2; R0 = tid = blockIDx.x * blockDim.x + threadIdx.x
    /*0020*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x34], PT; P0 = (tid >= N) then EXIT
    /*0028*/ @P0 EXIT;
    /*0030*/ ISETP.LT.AND P0, PT, RZ, c[0x0][0x30], PT; P0 = (0 < N_loop)
    /*0038*/ @P0 BRA 0x60;
    /*0040*/ MOV R4, RZ;
    /*0048*/ BRA 0x170;
    /*0050*/ NOP;
    /*0058*/ NOP;
    /*0060*/ MOV R2, c[0x0][0x30]; R2 = N_loop
    /*0068*/ IMUL R3, R0, c[0x0][0x30]; R3 = tid * N_loop
    /*0070*/ MOV32I R6, 0x4; R6 = sizeof(float) = 4
    /*0078*/ ISETP.GT.AND P0, PT, R2, 0x3, PT; P0 = (R2 >= 3)
    /*0080*/ IMAD R2.CC, R3, R6, c[0x0][0x20]; R2 = R3 * R6 + input = tid * N_loop * 4 + input
    /*0088*/ MOV R4, RZ; R4 = 0
    /*0090*/ MOV R5, RZ; R5 = 0
    /*0098*/ IMAD.HI.X R3, R3, R6, c[0x0][0x24];
    /*00a0*/ @!P0 BRA 0x128;
    /*00a8*/ MOV R6, c[0x0][0x30]; R6 = N_loop
    /*00b0*/ IADD R10, R6, -0x3; R10 = N_loop - 3
    /*00b8*/ NOP;
    /*00c0*/ IADD R5, R5, 0x4; R5 = R5 + 4 = 4
    /*00c8*/ LD.E R6, [R2]; R6 = input[tid * N_loop]
    /*00d0*/ ISETP.LT.AND P0, PT, R5, R10, PT; P0 = (4 < (N_loop - 3))
    /*00d8*/ LD.E R7, [R2+0x4]; R7 = input[tid * N_loop + 1]
    /*00e0*/ LD.E R8, [R2+0x8]; R8 = input[tid * N_loop + 2]
    /*00e8*/ LD.E R9, [R2+0xc]; R9 = input[tid * N_loop + 3]
    /*00f0*/ IADD R2.CC, R2, 0x10; R2 = R2 + 16 = R2 + 4 * sizeof(float)
    /*00f8*/ IADD.X R3, R3, RZ;
    /*0100*/ FADD R6, R4, R6; R6 = 0 + input[tid * N_loop]
    /*0108*/ FADD R4, R6, R7; R4 = input[tid * N_loop] + input[tid * N_loop + 1]
    /*0110*/ FADD R8, R4, R8; R8 = input[tid * N_loop] + input[tid * N_loop + 1] + input[tid * N_loop + 2]
    /*0118*/ FADD R4, R8, R9; R4 = input[tid * N_loop] + input[tid * N_loop + 1] + input[tid * N_loop + 2] + input[tid * N_loop + 3]
    /*0120*/ @P0 BRA 0xc0; ...
    /*0128*/ ISETP.LT.AND P0, PT, R5, c[0x0][0x30], PT;
    /*0130*/ @!P0 BRA 0x170;
    /*0138*/ IADD R5, R5, 0x1;
    /*0140*/ LD.E R6, [R2];
    /*0148*/ ISETP.LT.AND P0, PT, R5, c[0x0][0x30], PT;
    /*0150*/ IADD R2.CC, R2, 0x4;
    /*0158*/ IADD.X R3, R3, RZ;
    /*0160*/ FADD R4, R4, R6;
    /*0168*/ @P0 BRA 0x138;
    /*0170*/ MOV32I R3, 0x4;
    /*0178*/ IMAD R2.CC, R0, R3, c[0x0][0x28];
    /*0180*/ IMAD.HI.X R3, R0, R3, c[0x0][0x2c];
    /*0188*/ ST.E [R2], R4;
    /*0190*/ EXIT;

    可以看出,编译器自动执行 4的循环展开。 ,如我所见 4加载操作 ( LD ) 和 3不同的添加 ( FADD )

    内核 4
    /*0000*/         MOV R1, c[0x1][0x100];
    /*0008*/ S2R R0, SR_CTAID.X;
    /*0010*/ S2R R2, SR_TID.X;
    /*0018*/ IMAD R13, R0, c[0x0][0x8], R2;
    /*0020*/ ISETP.GE.AND P0, PT, R13, c[0x0][0x34], PT;
    /*0028*/ @P0 EXIT;
    /*0030*/ MOV R2, c[0x0][0x30];
    /*0038*/ SHR R0, R2, 0x1f;
    /*0040*/ ISETP.GT.AND P0, PT, R2, 0x3, PT;
    /*0048*/ IMAD.U32.U32.HI R0, R0, 0x4, R2;
    /*0050*/ SHR R0, R0, 0x2;
    /*0058*/ @P0 BRA 0x98;
    /*0060*/ MOV R18, RZ;
    /*0068*/ MOV R19, RZ;
    /*0070*/ MOV R10, RZ;
    /*0078*/ MOV R11, RZ;
    /*0080*/ BRA 0x308;
    /*0088*/ NOP;
    /*0090*/ NOP;
    /*0098*/ MOV R3, c[0x0][0x30];
    /*00a0*/ IMUL R4, R13, c[0x0][0x30];
    /*00a8*/ MOV32I R5, 0x4;
    /*00b0*/ IMUL R2, R3, 0x3;
    /*00b8*/ SHL R6, R3, 0x1;
    /*00c0*/ IADD R10, R0, R4;
    /*00c8*/ SHR R3, R2, 0x1f;
    /*00d0*/ IMAD R8.CC, R4, R5, c[0x0][0x20];
    /*00d8*/ SHR R7, R6, 0x1f;
    /*00e0*/ IMAD.U32.U32.HI R2, R3, 0x4, R2;
    /*00e8*/ IMAD.HI.X R9, R4, R5, c[0x0][0x24];
    /*00f0*/ IMAD.U32.U32.HI R7, R7, 0x4, R6;
    /*00f8*/ IMAD.HI R3, R2, c[0x10][0x0], R4;
    /*0100*/ IMAD R6.CC, R10, R5, c[0x0][0x20];
    /*0108*/ ISETP.GT.AND P0, PT, R0, 0x1, PT;
    /*0110*/ IMAD.HI R14, R7, c[0x10][0x0], R4;
    /*0118*/ MOV R18, RZ;
    /*0120*/ IMAD.HI.X R7, R10, R5, c[0x0][0x24];
    /*0128*/ MOV R19, RZ;
    /*0130*/ IMAD R2.CC, R3, R5, c[0x0][0x20];
    /*0138*/ MOV R10, RZ;
    /*0140*/ IMAD.HI.X R3, R3, R5, c[0x0][0x24];
    /*0148*/ MOV R11, RZ;
    /*0150*/ IMAD R4.CC, R14, R5, c[0x0][0x20];
    /*0158*/ MOV R12, RZ;
    /*0160*/ IMAD.HI.X R5, R14, R5, c[0x0][0x24];
    /*0168*/ @!P0 BRA 0x260;
    /*0170*/ IADD R16, R0, -0x1;
    /*0178*/ NOP;
    /*0180*/ IADD R12, R12, 0x2;
    /*0188*/ LD.E R15, [R8];
    /*0190*/ ISETP.LT.AND P0, PT, R12, R16, PT;
    /*0198*/ LD.E R20, [R6];
    /*01a0*/ FADD R17, R18, R15;
    /*01a8*/ LD.E R14, [R4];
    /*01b0*/ FADD R19, R19, R20;
    /*01b8*/ LD.E R15, [R2];
    /*01c0*/ LD.E R18, [R8+0x4];
    /*01c8*/ LD.E R20, [R6+0x4];
    /*01d0*/ IADD R6.CC, R6, 0x8;
    /*01d8*/ NOP;
    /*01e0*/ FADD R14, R10, R14;
    /*01e8*/ FADD R15, R11, R15;
    /*01f0*/ IADD.X R7, R7, RZ;
    /*01f8*/ LD.E R10, [R4+0x4];
    /*0200*/ IADD R4.CC, R4, 0x8;
    /*0208*/ LD.E R11, [R2+0x4];
    /*0210*/ IADD.X R5, R5, RZ;
    /*0218*/ FADD R18, R17, R18;
    /*0220*/ IADD R2.CC, R2, 0x8;
    /*0228*/ FADD R19, R19, R20;
    /*0230*/ IADD.X R3, R3, RZ;
    /*0238*/ IADD R8.CC, R8, 0x8;
    /*0240*/ IADD.X R9, R9, RZ;
    /*0248*/ FADD R10, R14, R10;
    /*0250*/ FADD R11, R15, R11;
    /*0258*/ @P0 BRA 0x180;
    /*0260*/ ISETP.LT.AND P0, PT, R12, R0, PT;
    /*0268*/ @!P0 BRA 0x308;
    /*0270*/ IADD R12, R12, 0x1;
    /*0278*/ LD.E R17, [R8];
    /*0280*/ ISETP.LT.AND P0, PT, R12, R0, PT;
    /*0288*/ LD.E R16, [R6];
    /*0290*/ IADD R6.CC, R6, 0x4;
    /*0298*/ LD.E R15, [R4];
    /*02a0*/ IADD.X R7, R7, RZ;
    /*02a8*/ LD.E R14, [R2];
    /*02b0*/ IADD R4.CC, R4, 0x4;
    /*02b8*/ IADD.X R5, R5, RZ;
    /*02c0*/ IADD R2.CC, R2, 0x4;
    /*02c8*/ IADD.X R3, R3, RZ;
    /*02d0*/ IADD R8.CC, R8, 0x4;
    /*02d8*/ IADD.X R9, R9, RZ;
    /*02e0*/ FADD R18, R18, R17;
    /*02e8*/ FADD R19, R19, R16;
    /*02f0*/ FADD R10, R10, R15;
    /*02f8*/ FADD R11, R11, R14;
    /*0300*/ @P0 BRA 0x270;
    /*0308*/ FADD R0, R18, R19;
    /*0310*/ MOV32I R3, 0x4;
    /*0318*/ FADD R0, R0, R10;
    /*0320*/ IMAD R2.CC, R13, R3, c[0x0][0x28];
    /*0328*/ FADD R0, R0, R11;
    /*0330*/ IMAD.HI.X R3, R13, R3, c[0x0][0x2c];
    /*0338*/ ST.E [R2], R0;
    /*0340*/ EXIT;

    在这种情况下,编译器会自动执行 4 的循环展开,叠加到 4的手动循环展开,如我所见 8加载操作 ( LD ) 和 7不同的添加 ( FADD )。

    尽管反汇编的代码与 Fermi 架构的代码不同,但 Kepler 架构的编译器行为也相似。

    由于具有自动循环展开功能,不同内核之间的性能差异不大:
    GT 210 (c.c. 1.2)

    Kernel 1 = 111ms
    Kernel 2 = 108ms
    Kernel 3 = 107ms
    Kernel 4 = 110ms

    Kepler K20c (c.c. 3.5)

    Kernel 1 = 1.8ms
    Kernel 2 = 1.8ms
    Kernel 3 = 1.8ms
    Kernel 4 = 1.8ms

    我没有明确提供 Fermi 架构的结果,但四个考虑的内核的时间大致相同。

    关于cuda - NVIDIA CUDA 编译器自动展开循环,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/17446448/

    25 4 0
    Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
    广告合作:1813099741@qq.com 6ren.com