memory - cudamemcpy 错误 :"the launch timed out and was terminated"-6ren

memory - cudamemcpy 错误 :"the launch timed out and was terminated"

转载作者：太空宇宙更新时间：2023-11-04 04:43:19

我的代码是计算 pi 的第 n 位的并行实现。当我完成内核并尝试将内存复制回主机时，我收到“启动超时并被终止”错误。我使用此代码对每个 cudamalloc、cudamemcpy 和内核启动进行错误检查。

std::string error = cudaGetErrorString(cudaGetLastError());
printf("%s\n", error);

这些调用表明一切正常，直到从内核返回后的第一个 cudamemcpy 调用。错误发生在“cudaMemcpy(avhost, avdev, size, cudaMemcpyDeviceToHost);”行中在主要。感谢您的帮助。

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#define mul_mod(a,b,m) fmod( (double) a * (double) b, m)
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
/* return the inverse of x mod y */
__device__ int inv_mod(int x,int y) {
  int q,u,v,a,c,t;

  u=x;
  v=y;
  c=1;
  a=0;
  do {
    q=v/u;

    t=c;
    c=a-q*c;
    a=t;

    t=u;
    u=v-q*u;
    v=t;
  } while (u!=0);
  a=a%y;
  if (a<0) a=y+a;
  return a;
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
/* return the inverse of u mod v, if v is odd */
__device__ int inv_mod2(int u,int v) {
  int u1,u3,v1,v3,t1,t3;

  u1=1;
  u3=u;

  v1=v;
  v3=v;

  if ((u&1)!=0) {
    t1=0;
    t3=-v;
    goto Y4;
  } else {
    t1=1;
    t3=u;
  }

  do {

    do {
      if ((t1&1)==0) {
    t1=t1>>1;
    t3=t3>>1;
      } else {
    t1=(t1+v)>>1;
    t3=t3>>1;
      }
      Y4:;
    } while ((t3&1)==0);

    if (t3>=0) {
      u1=t1;
      u3=t3;
    } else {
      v1=v-t1;
      v3=-t3;
    }
    t1=u1-v1;
    t3=u3-v3;
    if (t1<0) {
      t1=t1+v;
    }
  } while (t3 != 0);
  return u1;
}


/* return (a^b) mod m */
__device__ int pow_mod(int a,int b,int m)
{
  int r,aa;

  r=1;
  aa=a;
  while (1) {
    if (b&1) r=mul_mod(r,aa,m);
    b=b>>1;
    if (b == 0) break;
    aa=mul_mod(aa,aa,m);
  }
  return r;
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
/* return true if n is prime */
int is_prime(int n)
{
   int r,i;
   if ((n % 2) == 0) return 0;

   r=(int)(sqrtf(n));
   for(i=3;i<=r;i+=2) if ((n % i) == 0) return 0;
   return 1;
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
/* return the prime number immediatly after n */
int next_prime(int n)
{
   do {
      n++;
   } while (!is_prime(n));
   return n;
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
#define DIVN(t,a,v,vinc,kq,kqinc)       \
{                       \
  kq+=kqinc;                    \
  if (kq >= a) {                \
    do { kq-=a; } while (kq>=a);        \
    if (kq == 0) {              \
      do {                  \
    t=t/a;                  \
    v+=vinc;                \
      } while ((t % a) == 0);           \
    }                       \
  }                     \
}

///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////

__global__ void digi_calc(int *s, int *av, int *primes, int N, int n, int nthreads){
    int a,vmax,num,den,k,kq1,kq2,kq3,kq4,t,v,i,t1, h;
    unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
// GIANT LOOP
    for (h = 0; h<1; h++){
    if(tid > nthreads) continue;
    a = primes[tid];
    vmax=(int)(logf(3*N)/logf(a));
    if (a==2) {
      vmax=vmax+(N-n);
      if (vmax<=0) continue;
    }
    av[tid]=1;
    for(i=0;i<vmax;i++) av[tid]*= a;

    s[tid]=0;
    den=1;
    kq1=0;
    kq2=-1;
    kq3=-3;
    kq4=-2;
    if (a==2) {
      num=1;
      v=-n; 
    } else {
      num=pow_mod(2,n,av[tid]);
      v=0;
    }

    for(k=1;k<=N;k++) {

      t=2*k;
      DIVN(t,a,v,-1,kq1,2);
      num=mul_mod(num,t,av[tid]);

      t=2*k-1;
      DIVN(t,a,v,-1,kq2,2);
      num=mul_mod(num,t,av[tid]);

      t=3*(3*k-1);
      DIVN(t,a,v,1,kq3,9);
      den=mul_mod(den,t,av[tid]);

      t=(3*k-2);
      DIVN(t,a,v,1,kq4,3);
      if (a!=2) t=t*2; else v++;
      den=mul_mod(den,t,av[tid]);

      if (v > 0) {
    if (a!=2) t=inv_mod2(den,av[tid]);
    else t=inv_mod(den,av[tid]);
    t=mul_mod(t,num,av[tid]);
    for(i=v;i<vmax;i++) t=mul_mod(t,a,av[tid]);
    t1=(25*k-3);                                                                                                                                                                                                                                                                                                                                                                       
    t=mul_mod(t,t1,av[tid]);
    s[tid]+=t;
    if (s[tid]>=av[tid]) s-=av[tid];
      }
    }

    t=pow_mod(5,n-1,av[tid]);
    s[tid]=mul_mod(s[tid],t,av[tid]);
    }
    __syncthreads();
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
int main(int argc,char *argv[])
{
  int N,n,i,totalp, h;
  double sum;
  const char *error;
  int *sdev, *avdev, *shost, *avhost, *adev, *ahost;
    argc = 2;
    argv[1] = "2";
  if (argc<2 || (n=atoi(argv[1])) <= 0) {
    printf("This program computes the n'th decimal digit of pi\n"
       "usage: pi n , where n is the digit you want\n"
       );
    exit(1);
  }
    sum = 0;
    N=(int)((n+20)*logf(10)/logf(13.5));
    totalp=(N/logf(N))+10;
    ahost = (int *)calloc(totalp, sizeof(int));
    i = 0;
    ahost[0]=2;
    for(i=1; ahost[i-1]<=(3*N); ahost[i+1]=next_prime(ahost[i])){
        i++;
    }
    // allocate host memory
    size_t size = i*sizeof(int);
    shost = (int *)malloc(size);
    avhost = (int *)malloc(size);

  //allocate memory on device
    cudaMalloc((void **) &sdev, size);
    cudaMalloc((void **) &avdev, size);
    cudaMalloc((void **) &adev, size);
    cudaMemcpy(adev, ahost, size, cudaMemcpyHostToDevice);

    if (i >= 512){
        h = 512;
    }
    else h = i;
    dim3 dimGrid(((i+512)/512),1,1);                   
    dim3 dimBlock(h,1,1);

    // launch kernel
    digi_calc <<<dimGrid, dimBlock >>> (sdev, avdev, adev, N, n, i);

    //copy memory back to host
    cudaMemcpy(avhost, avdev, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(shost, sdev, size, cudaMemcpyDeviceToHost);

  // end malloc's, memcpy's, kernel calls
    for(h = 0; h <=i; h++){
    sum=fmod(sum+(double) shost[h]/ (double) avhost[h],1.0);
    }
  printf("Decimal digits of pi at position %d: %09d\n",n,(int)(sum*1e9));
    //free memory
    cudaFree(sdev);
    cudaFree(avdev);
    cudaFree(adev);
    free(shost);
    free(avhost);
    free(ahost);
  return 0;
}

最佳答案

这与您在 this question 中询问的问题完全相同.内核被驱动程序提前终止，因为它完成时间太长。如果您阅读任何这些运行时 API 函数的文档，您将看到以下注释:

Note: Note that this function may also return error codes from previous, asynchronous launches.

所有发生的事情是内核启动后的第一个 API 调用返回内核运行时发生的错误 - 在本例中为 cudaMemcpy 调用。您可以自己确认这一点的方法是在内核启动后直接执行类似的操作:

// launch kernel
digi_calc <<<dimGrid, dimBlock >>> (sdev, avdev, adev, N, n, i);
std::string error = cudaGetErrorString(cudaPeekAtLastError());
printf("%s\n", error);
error = cudaGetErrorString(cudaThreadSynchronize());
printf("%s\n", error);

cudaPeekAtLastError() 调用将显示内核启动是否有任何错误，cudaThreadSynchronize() 调用返回的错误代码将显示是否有任何错误内核执行时产生错误。

解决方案与上一个问题中概述的完全相同:可能最简单的方法是重新设计代码，使其“可重入”，这样您就可以将工作分配给多个内核启动，每个内核都在显示驱动程序下安全启动看门狗定时器限制。

关于memory - cudamemcpy 错误 :"the launch timed out and was terminated"，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/23734688/

文章推荐： c - 将两个字符打包成一个整数

文章推荐： linux - Docker 区别 docker run[...] docker 容器运行 [...]

文章推荐： python - range(i+1) 和 [0]*(i+1) 之间的区别

terminator - Terminator 端子的大小可以保存在布局中吗？
我一直在尝试配置 Terminator 以尊重我的终端的大小。这可能吗？事实上，每次我重新打开 Terminator 时，我都必须将几个终端的大小调整为我想要的配置。首选项似乎没有配置这个选项。它似
terminal - "Terminal"想管理您的计算机。管理可以包括密码
我正在尝试通过命令在 Mac Catalina 10.15.5 中自动添加 crontab: echo -e "* * * * * \run.sh"|crontab - 此命令复制 crontab -
terminal - Mac Terminal-从png文件创建动画gif
我有一堆名为1.png，2.png等的png文件，我想从它们全部创建一个动画gif图像。我尚未成功找到将这些png文件转换为单个gif动画的终端命令的解决方案。有人可以发布一些我可以尝试的命令吗？我
戈朗 : How to terminate the bufio Scan() from terminal?
我正在运行 dup1 example来自 The Go Programming Language本书(相关代码如下所示): for input.Scan() { counts[input.Te
gnome-terminal - 如何从命令行启动 gnome-terminal 并复制环境变量？
有没有办法从命令行启动 gnome-terminal(即，使用/usr/bin/gnome-terminal 命令)并让新终端继承运行命令的终端的环境变量和其他设置变量?场景是这样的: 打开终端设置
terminal - 继续获取 : No such File or directory when I open the Terminal
每次我打开终端时，我都会收到这样的信息: bash: /usr/local/Cellar/nvm/0.33.0/nvm.sh: No such file or directory 我试过重新加载。不知
gnome-terminal - 如何更改 gnome-terminal 的背景颜色？
如何更改背景颜色？我选择了 Profile > Background > solid color ，但我在哪里可以选择颜色？最佳答案有两种受支持的方法可以更改 xterm 的背景颜色(请记住 xt
gnome-terminal - 如何从脚本在 gnome-terminal 中打开新选项卡
我知道我可以右键单击 > 打开选项卡以在 gnome-terminal 中打开一个新选项卡，但是如何从脚本中执行相同操作？如果我使用 'gnome-terminal --tab-with-profil
terminal - 纳米错误 : Error opening terminal: xterm-256color
安装 OSX Lion 后，我尝试: nano /etc/apt/sources.list 但我收到此错误: Error opening terminal: xterm-256color 如果我尝试切
mysql - "FIELDS TERMINATED BY"和 "LINES TERMINATED BY"有什么问题？
所以我创建了一个名为 specs 的 mysql 表，我想将一个 csv 文件导入到该表中。 CREATE TABLE specs ( `Id` INT NOT NULL, `Brand` V
terminal - 为什么我在终端 (xfce-terminal) 中只看到 8 种颜色？
我正在运行 Xubuntu 13.04，我想使用 Vim 作为我所有的默认编辑器。我下载了很多vim配色方案试了一下，但是都不像官方截图。例如，vim 自带的颜色模式——沙漠应该是这样的: 但在我的
terminal - 尽管安装了 Spyder-Terminal，但 Spyder 5 中没有终端
我在 Mac OS Big Sur 上安装了 Spyder 5。我从终端运行了这个命令:conda install spyder-terminal -c spyder-ide该命令运行无误。仍然没有
ssh - 如何处理 "Pseudo-terminal will not be allocated because stdin is not a terminal."
ssh -t remotehost vim /tmp/x.txt 我知道我可以运行上面这样的命令。但我希望能够在远程计算机上运行任何本地 bash 代码。因此，我想调用远程“bash -s”，以便可
php - LINES TERMINATED BY 和 FIELDS TERMINATED BY 的多种可能性 - MySQL
我正在尝试在我的内容管理系统中创建一项功能，用户可以上传 CSV 文件，然后解析该文件并将数据放入 MySQL 数据库中。为此，我使用文件输入和 SQL 查询。 $sql = "LOAD DATA L
terminal - completion-prefix-display-length 在我的 terminal.app 上不起作用
我在 .inputrc 上发现了很多问题，答案包括使用 Mac OS X terminal.app 的人。但是我已经设置了这个属性并且它没有改变 readline 的行为。我在 Ubuntu 上尝试
r - 派对套件 : Displaying terminal node percentile values above terminal node boxplots
我正在尝试绘制由 rpart 生成的回归树使用 partykit .假设使用的公式是 y ~ x1 + x2 + x3 + ... + xn .我想要实现的是在终端节点中带有箱线图的树，顶部有一个标签
terminal - VSCode : how to make Ctrl+k kill till the end of line in the terminal?
在 Visual Studio Code 中，Ctrl-k 快捷键绑定(bind)到 clearing the terminal . 我已经configured bash as my terminal
macos - 警告 : Plotting with an 'unknown' terminal. 将不会生成任何输出。请选择带有 'set terminal' 的终端
我正在学习斯坦福大学 Andrew Wu 教授的精彩机器学习类(class)。当我在 ex5 文件中执行绘图函数时，octave cli 命令行报告以下警告: set terminal aqua en
Python 多处理 : What is the better implementation to terminate daemon process when the parent process terminated?
我在 Ubuntu 上使用 Python 3.6 的多处理来处理与另一台设备的更快通信。我设置 daemon = True 以在父进程完成时终止子进程。但是，当主进程终止时，另一个进程(以下代码中的
windows - Apache Ant : Terminate process started by when ant process is terminated
我有一个 ant 任务，它使用执行冗长的构建操作. Ant 由 Windows 命令行中的批处理文件启动。如果我通过关闭窗口来终止 ant 任务，则进程由启动继续运行。当 ant 进程本身终止

太空宇宙

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

memory - cudamemcpy 错误 :"the launch timed out and was terminated"