gpt4 book ai didi

cuda - cudaSetDevice() 对 CUDA 设备的上下文堆栈有什么作用?

转载 作者:行者123 更新时间:2023-12-04 02:33:39 28 4
gpt4 key购买 nike

假设我有一个与设备 i 关联的事件 CUDA 上下文,我现在调用cudaSetDevice(i) .发生什么了? :

  • 没有?
  • 主上下文替换栈顶?
  • 主要上下文被压入堆栈?

  • 它实际上似乎是不一致的。我已经编写了这个程序,在一台只有一个设备的机器上运行:
    #include <cuda.h>
    #include <cuda_runtime_api.h>
    #include <cassert>
    #include <iostream>

    int main()
    {
    CUcontext ctx1, primary;
    cuInit(0);
    auto status = cuCtxCreate(&ctx1, 0, 0);
    assert (status == (CUresult) cudaSuccess);
    cuCtxPushCurrent(ctx1);
    status = cudaSetDevice(0);
    assert (status == cudaSuccess);
    void* ptr1;
    void* ptr2;
    cudaMalloc(&ptr1, 1024);
    assert (status == cudaSuccess);
    cuCtxGetCurrent(&primary);
    assert (status == (CUresult) cudaSuccess);
    assert(primary != ctx1);
    status = cuCtxPushCurrent(ctx1);
    assert (status == (CUresult) cudaSuccess);
    cudaMalloc(&ptr2, 1024);
    assert (status == (CUresult) cudaSuccess);
    cudaSetDevice(0);
    assert (status == (CUresult) cudaSuccess);
    int i = 0;
    while (true) {
    status = cuCtxPopCurrent(&primary);
    if (status != (CUresult) cudaSuccess) { break; }
    std::cout << "Next context on stack (" << i++ << ") is " << (void*) primary << '\n';
    }
    }
    我得到以下输出:
    context ctx1 is 0x563ec6225e30
    primary context is 0x563ec61f5490
    Next context on stack (0) is 0x563ec61f5490
    Next context on stack (1) is 0x563ec61f5490
    Next context on stack(2) is 0x563ec6225e3
    这似乎是一种行为有时是一种替代,有时是一种插入。
    这是怎么回事?

    最佳答案

    TL;DR:根据您提供的代码,在您的特定用途的两个实例中,似乎 cudaSetDevice()正在替换堆栈顶部的上下文。
    让我们稍微修改一下您的代码,然后看看我们可以推断出代码中每个 API 调用对上下文堆栈的影响:

    $ cat t1759.cu
    #include <cuda.h>
    #include <cuda_runtime_api.h>
    #include <cassert>
    #include <iostream>
    void check(int j, CUcontext ctx1, CUcontext ctx2){
    CUcontext ctx0;
    int i = 0;
    while (true) {
    auto status = cuCtxPopCurrent(&ctx0);
    if (status != CUDA_SUCCESS) { break; }
    if (ctx0 == ctx1) std::cout << j << ":Next context on stack (" << i++ << ") is ctx1:" << (void*) ctx0 << '\n';
    else if (ctx0 == ctx2) std::cout << j << ":Next context on stack (" << i++ << ") is ctx2:" << (void*) ctx0 << '\n';
    else std::cout << j << ":Next context on stack (" << i++ << ") is unknown:" << (void*) ctx0 << '\n';
    }
    }
    void runtest(int i)
    {
    CUcontext ctx1, primary = NULL;
    cuInit(0);
    auto dstatus = cuCtxCreate(&ctx1, 0, 0); // checkpoint 1
    assert (dstatus == CUDA_SUCCESS);
    if (i == 1) {check(i,ctx1,primary); return;}// checkpoint 1
    dstatus = cuCtxPushCurrent(ctx1); // checkpoint 2
    assert (dstatus == CUDA_SUCCESS);
    if (i == 2) {check(i,ctx1,primary); return;}// checkpoint 2
    auto rstatus = cudaSetDevice(0); // checkpoint 3
    assert (rstatus == cudaSuccess);
    if (i == 3) {check(i,ctx1,primary); return;}// checkpoint 3
    void* ptr1;
    void* ptr2;
    rstatus = cudaMalloc(&ptr1, 1024); // checkpoint 4
    assert (rstatus == cudaSuccess);
    if (i == 4) {check(i,ctx1,primary); return;}// checkpoint 4
    dstatus = cuCtxGetCurrent(&primary); // checkpoint 5
    assert (dstatus == CUDA_SUCCESS);
    assert(primary != ctx1);
    if (i == 5) {check(i,ctx1,primary); return;}// checkpoint 5
    dstatus = cuCtxPushCurrent(ctx1); // checkpoint 6
    assert (dstatus == CUDA_SUCCESS);
    if (i == 6) {check(i,ctx1,primary); return;}// checkpoint 6
    rstatus = cudaMalloc(&ptr2, 1024); // checkpoint 7
    assert (rstatus == cudaSuccess);
    if (i == 7) {check(i,ctx1,primary); return;}// checkpoint 7
    rstatus = cudaSetDevice(0); // checkpoint 8
    assert (rstatus == cudaSuccess);
    if (i == 8) {check(i,ctx1,primary); return;}// checkpoint 8
    return;
    }

    int main(){
    for (int i = 1; i < 9; i++){
    cudaDeviceReset();
    runtest(i);}
    }
    $ nvcc -o t1759 t1759.cu -lcuda -std=c++11
    $ ./t1759
    1:Next context on stack (0) is ctx1:0x11087e0
    2:Next context on stack (0) is ctx1:0x1741160
    2:Next context on stack (1) is ctx1:0x1741160
    3:Next context on stack (0) is unknown:0x10dc520
    3:Next context on stack (1) is ctx1:0x1c5aa70
    4:Next context on stack (0) is unknown:0x10dc520
    4:Next context on stack (1) is ctx1:0x23eaa00
    5:Next context on stack (0) is ctx2:0x10dc520
    5:Next context on stack (1) is ctx1:0x32caf30
    6:Next context on stack (0) is ctx1:0x3a44ed0
    6:Next context on stack (1) is ctx2:0x10dc520
    6:Next context on stack (2) is ctx1:0x3a44ed0
    7:Next context on stack (0) is ctx1:0x41cfd90
    7:Next context on stack (1) is ctx2:0x10dc520
    7:Next context on stack (2) is ctx1:0x41cfd90
    8:Next context on stack (0) is ctx2:0x10dc520
    8:Next context on stack (1) is ctx2:0x10dc520
    8:Next context on stack (2) is ctx1:0x4959c70
    $
    基于上述内容,当我们继续执行代码中的每个 API 调用时:
    1.
            auto dstatus = cuCtxCreate(&ctx1, 0, 0);    // checkpoint 1
    1:Next context on stack (0) is ctx1:0x11087e0
    上下文创建还将新创建的上下文推送到堆栈上,如 here 所述.
    2.
            dstatus = cuCtxPushCurrent(ctx1);           // checkpoint 2
    2:Next context on stack (0) is ctx1:0x1741160
    2:Next context on stack (1) is ctx1:0x1741160
    毫不奇怪,将相同的上下文推送到堆栈上会为其创建另一个堆栈条目。
    3.
            auto rstatus = cudaSetDevice(0);            // checkpoint 3
    3:Next context on stack (0) is unknown:0x10dc520
    3:Next context on stack (1) is ctx1:0x1c5aa70
    cudaSetDevice() call 已将堆栈顶部替换为“未知”上下文。 (此时只是未知,因为我们还没有检索到“其他”上下文的句柄值)。
    4.
            rstatus = cudaMalloc(&ptr1, 1024);          // checkpoint 4
    4:Next context on stack (0) is unknown:0x10dc520
    4:Next context on stack (1) is ctx1:0x23eaa00
    由于此调用,堆栈配置没有差异。
    5.
            dstatus = cuCtxGetCurrent(&primary);        // checkpoint 5
    5:Next context on stack (0) is ctx2:0x10dc520
    5:Next context on stack (1) is ctx1:0x32caf30
    由于此调用,堆栈配置没有差异,但我们现在知道堆栈上下文的顶部是当前上下文(我们可以推测它是主上下文)。
    6.
            dstatus = cuCtxPushCurrent(ctx1);           // checkpoint 6
    6:Next context on stack (0) is ctx1:0x3a44ed0
    6:Next context on stack (1) is ctx2:0x10dc520
    6:Next context on stack (2) is ctx1:0x3a44ed0
    这里没有真正的惊喜。我们正在推 ctx1在堆栈上,因此堆栈有 3 个条目,第一个是驱动程序 API 创建的上下文,接下来的两个条目与步骤 5 中的堆栈配置相同,只是向下移动了一个堆栈位置。
    7.
            rstatus = cudaMalloc(&ptr2, 1024);          // checkpoint 7
    7:Next context on stack (0) is ctx1:0x41cfd90
    7:Next context on stack (1) is ctx2:0x10dc520
    7:Next context on stack (2) is ctx1:0x41cfd90
    同样,此调用对堆栈配置没有影响。
    8.
            rstatus = cudaSetDevice(0);                 // checkpoint 8
    8:Next context on stack (0) is ctx2:0x10dc520
    8:Next context on stack (1) is ctx2:0x10dc520
    8:Next context on stack (2) is ctx1:0x4959c70
    再一次,我们看到这里的行为是 cudaSetDevice() call 已将堆栈上下文的顶部替换为主上下文。
    我从您的测试代码中得出的结论是,我没有看到 cudaSetDevice() 的行为不一致。与代码中的各种运行时和驱动程序 API 调用混合时调用。
    在我看来,这种编程范式是疯狂的。我无法想象您为什么要以这种方式混合驱动程序 API 和运行时 API 代码。

    关于cuda - cudaSetDevice() 对 CUDA 设备的上下文堆栈有什么作用?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/62877646/

    28 4 0
    Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
    广告合作:1813099741@qq.com 6ren.com