CUDA/C 矩阵乘法-6ren

CUDA/C 矩阵乘法

转载作者：行者123 更新时间：2023-11-30 16:01:41

有人可以告诉我我在这里做错了什么吗？我正在尝试创建一个程序，使用 cuda 将矩阵返回到幂。似乎 cudaMemcpy (ln103) 没有返回结果数组。我通过返回矩阵中的第一个元素来检查它，但我总是只得到 0。也许我的内核有问题？将不胜感激任何帮助:

编辑:我应该澄清一下，迭代内核(从矩阵乘以相应的单位矩阵开始，然后乘以此后的每个结果)直到 k 次，从而给出矩阵的幂。

即A 是一个矩阵A^0 = I(单位矩阵)A^k = A^(k-1)*A

输入:

<n>
<power>
<element>
.....

代码:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>

#define BLOCK 8
#define SIZE (BLOCK * 64)
#define TILE_SIZE (8)

int n;


float *
create_matrix_h(unsigned int w, unsigned int h) {
  float *m;
  m = (float *) malloc(w * h * sizeof(float));
  if (m == NULL) {
    fprintf(stderr, "Failed to malloc.\n");
    exit(1);
  }
  return m;
}

__global__ void
kernel3(const float *m1, const float *m2, float *m3, unsigned int width) {
  const unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;
  const unsigned int col = blockIdx.x*blockDim.x + threadIdx.x;
  unsigned int t, i;
  float result = 0, a, b;

  for (t = 0; t < width / TILE_SIZE; ++t) {
    for (i = 0; i != TILE_SIZE; ++i) {
      a = m1[row*width + t*TILE_SIZE + i];
      b = m2[(t*TILE_SIZE + i)*width + col];
      result += a * b;
    }
    __syncthreads();
  }
  m3[row*width + col] = result;
}

float *
create_matrix_d(int w, int h) {
  float *m;
  if (cudaMalloc(&m, w * h * sizeof(float)) == cudaErrorMemoryAllocation) {
    fprintf(stderr, "Failed to cudaMalloc.\n");
    return NULL;
    //exit(1);
  }
  return m;
}

void
fill_matrix_h(float *const m, int w, int h, float *const values, int nvalues) {
  int i, j = 0;
  for (i = 0; i != w * h; ++i) {
    m[i] = values[j];
    j = (j + 1) % nvalues;
  }
}

int
main(void) {
    int k;
    if (scanf("%d", &n) !=1 || n<1){
        return 0;
    }
    if (scanf(" %d", &k) !=1 || k<0){
        return 0;
    }
    float *hm[3], *dm[3];
    dim3 bdim(TILE_SIZE, TILE_SIZE);
    dim3 gdim(SIZE/TILE_SIZE, SIZE/TILE_SIZE);
    int i;
    for(i=0; i<3; ++i) {
        hm[i] = create_matrix_h(SIZE, SIZE);
        dm[i] = create_matrix_d(SIZE, SIZE);
    }
    float tem[n*n];
    for(i=0; i<n*n; ++i) {
        if (scanf(" %f", &tem[i]) !=1){
            return 0;
        }
    }
    float temid[n*n];
    int j = 0;
    for (i = 0; i != n*n; ++i) {
        if (i==0 || i == j + (n+1)) {
            temid[i] = 1;
            j = i;
        }
        else {
            temid[i] = 0;
        }
    }
    fill_matrix_h(hm[0], SIZE, SIZE, tem, sizeof(tem)/sizeof(float));
    fill_matrix_h(hm[1], SIZE, SIZE, temid, sizeof(temid)/sizeof(float));
    cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
    int w;
    for (w=0; w<k; ++w) {
        cudaMemcpy(dm[1], hm[1], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
        kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE);
            cudaThreadSynchronize();
        cudaMemcpy(hm[2], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost);
        hm[1] = hm[2];
    }
    printf(" %.3f ", hm[2][0]);
    return 0;

 }

感谢您的回复帕万。现在，当我运行它时，我在内核调用中得到一个无限循环，输入如下。还有新代码。感谢您的帮助

新代码:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>

#define BLOCK 8
#define SIZE (BLOCK * 64)
#define TILE_SIZE (8)

int n;


float *
create_matrix_h(unsigned int w, unsigned int h) {
  float *m;
  m = (float *) malloc(w * h * sizeof(float));
  if (m == NULL) {
    fprintf(stderr, "Failed to malloc.\n");
    exit(1);
  }
  return m;
}

void
print_matrix(const float *m, const int w, const int h) {
  int x, y;
  for (y = 0; y != h; ++y) {
    for (x = 0; x != w; ++x)
      printf("%.03f ", m[y*w + x]);
    printf("\n");
  }
}


void
cpu_mult(const float *m1, const float *m2, float *m3, unsigned int width) {
  unsigned int i, j, k;
  float result;

  for (i = 0; i != width; ++i) {
    for (j = 0; j != width; ++j) {
      result = 0;
      for (k = 0; k != width; ++k)
        result += m1[i*width + k] * m2[k*width + j];
      m3[i*width + j] = result;
    }
  }
}


__global__ void
kernel3(const float *m1, const float *m2, float *m3, unsigned int width) {
  const unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;
  const unsigned int col = blockIdx.x*blockDim.x + threadIdx.x;
  unsigned int t, i;
  float result = 0, a, b;

  for (t = 0; t < width / TILE_SIZE; ++t) {
    for (i = 0; i != TILE_SIZE; ++i) {
      a = m1[row*width + t*TILE_SIZE + i];
      b = m2[(t*TILE_SIZE + i)*width + col];
      result += a * b;
    }
    __syncthreads();
  }
  m3[row*width + col] = result;
}

float *
create_matrix_d(int w, int h) {
  float *m;
  if (cudaMalloc(&m, w * h * sizeof(float)) == cudaErrorMemoryAllocation) {
    fprintf(stderr, "Failed to cudaMalloc.\n");
    return NULL;
    //exit(1);
  }
  return m;
}

void
fill_matrix_h(float *const m, int w, int h, float *const values, int nvalues) {
  int i, j = 0;
  for (i = 0; i != w * h; ++i) {
    m[i] = values[j];
    j = (j + 1) % nvalues;
  }
}

int
main(void) {
    int k;
    if (scanf("%d", &n) !=1 || n<1){
        return 0;
    }
    if (scanf(" %d", &k) !=1 || k<0){
        return 0;
    }
    float *hm[3], *dm[3];
    dim3 bdim(TILE_SIZE, TILE_SIZE);
    dim3 gdim(SIZE/TILE_SIZE, SIZE/TILE_SIZE);
    int i;
    for(i=0; i<3; ++i) {
        hm[i] = create_matrix_h(SIZE, SIZE);
        dm[i] = create_matrix_d(SIZE, SIZE);
    }
    float tem[n*n];
    for(i=0; i<n*n; ++i) {
        if (scanf(" %f", &tem[i]) !=1){
            return 0;
        }
    }
    float temid[n*n];
    int j = 0;
    for (i = 0; i != n*n; ++i) {
        if (i==0 || j == n) { // not j + (n+1)
            temid[i] = 1;
            j=0;
        }
        else {
            temid[i] = 0;
            j++;
        }
    }
    fill_matrix_h(hm[0], SIZE, SIZE, tem, sizeof(tem)/sizeof(float));
    fill_matrix_h(hm[1], SIZE, SIZE, temid, sizeof(temid)/sizeof(float));
    cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
    dm[1] = dm[0]; // For the first iteration Result = A * A;
    int w;
    if (k==0) {
        hm[2] = hm[1];
    }
    else if (k==1) {
        hm[2] = hm[0];
    }
    else {
        for (w=1; w<k; ++w) {
           kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE);
           cudaThreadSynchronize();
           // No need to copy back to host
           // cudaMemcpy(hm[2], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost);
           // Copy between device pointers
           cudaMemcpy(dm[1], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToDevice); 
        }
        cudaMemcpy(hm[2], dm[1], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost); 
    }



    print_matrix(hm[2], n, n);

    return 0;

 }

最佳答案

我相信我们在这里处于同一类(class)。 (comp2129)。为了响应您的无限循环，您的 block 大小/图 block 大小太小。将您的 block 设置为 16，然后重试。尽管 D: 我遇到了段错误。

关于CUDA/C 矩阵乘法，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/6225310/

文章推荐： objective-c - 多次 md5 一个字符串

文章推荐： javascript - 如何创建 HTML5 拖放编辑器？

文章推荐： javascript - 简单的 Javascript onclick 图片库不起作用

r - 矩阵 %in% 矩阵
假设我有两个矩阵，每个矩阵有两列和不同的行数。我想检查并查看一个矩阵的哪些对在另一个矩阵中。如果这些是一维的，我通常只会做 a %in% x得到我的结果。 match似乎只适用于向量。 > a
algorithm - 矩阵-矩阵乘法/矩阵-向量乘法有哪些不同类型的算法
关闭。这个问题是opinion-based .它目前不接受答案。想要改进这个问题？更新问题，以便 editing this post 可以用事实和引用来回答它. 关闭 9 个月前。 Improv
math - OpenGL 矩阵 VS DirectX 矩阵
我只处理过 DirectX 矩阵我读过一些文章，说不能将 DirectX 矩阵数学库用于 openGL 矩阵。但我也读过，如果你的数学是一致的，你可以获得类似的结果。那只会让我更加困惑。任何人都
c++ - 使用BLAS和OpenMP优化本征重组(矩阵-对角矩阵-矩阵)产品C++
我编写了一个C++代码来解决线性系统A.x = b，其中A是一个对称矩阵，方法是首先使用LAPACK(E)对角矩阵A = V.D.V^T(因为以后需要特征值)，然后求解x = A^-1.b = V^T
c++ - 动态创建一个 3x2 矩阵；打印它显示一个 2x2 矩阵
我遇到了问题。我想创建二维数组 rows=3 cols=2我的代码如下 int **ptr; int row=3; int col=2; ptr=new int *[col]; for (int i=
matlab - 从 3d 矩阵 Matlab 中获取 2d 矩阵
我有一个 3d mxnxt 矩阵，我希望能够提取 t 2d nxm 矩阵。在我的例子中，我有一个 1024x1024x10 矩阵，我想要 10 张图像显示给我。这不是 reshape ，我每次只需要
matlab - 将 3d 矩阵 reshape 为 2d 矩阵
我在 MATLAB 中有一个 3d 矩阵 (n-by-m-by-t) 表示一段时间内网格中的 n-by-m 测量值.我想要一个二维矩阵，其中空间信息消失了，只剩下 n*m 随着时间 t 的测量值(即:
python - 将 3D numpy 矩阵 reshape 为 2D numpy 矩阵，保持行位置
作为一个简化的示例，我有一个 3D numpy 矩阵，如下所示: a = np.array([[[1,2], [4,np.nan], [7,
python - 将 3D numpy 矩阵 reshape 为 2D numpy 矩阵，保持行位置
作为一个简化的示例，我有一个 3D numpy 矩阵，如下所示: a = np.array([[[1,2], [4,np.nan], [7,
c++ - 给定两个动态 R x C 矩阵，我如何交错行以生成一个 2R x C 矩阵？
使用 eigen2 , 并给定一个矩阵 A a_0_0, a_0_1, a_0_2, ... a_1_0, a_1_0, a_1_2, ... ... 和一个矩阵B: b_0_0, b_0_1, b_
html - 中型和大型设备上为 2 x 2 矩阵，小型设备上为 4 x 1 矩阵
我想知道如何获得下面的布局。在中型和大型设备上，我希望有 2 行和 2 列的布局(2 x 2 矩阵)。在小型(和超小型)设备上或调整为小型设备时，我想要一个 4 行和 1 列的矩阵。我将通过 a
matlab - 将(4D 矩阵 * 1D 向量)操作转换为独立的(3D 矩阵 * 0D 标量)操作，无需循环
有什么方法可以向量化以下内容: for i = 1:6 te = k(:,:,:,i).*(c(i)); end 我正在尝试将 4D 矩阵 k 乘以向量 c，方法是将其
随机抽样 - 矩阵
如何从填充有 1 和 0 的矩阵中抽取 n 个随机点的样本？ a=rep(0:1,5) b=rep(0,10) c=rep(1,10) dataset=matrix(cbind(a,b,c),nrow
JavaScript 矩阵
我正在尝试创建一个包含 X 个 X 的矩阵。以下代码生成从左上角到右下角的 X 对 Angular 线，而不是从右上角到左下角的 X 对 Angular 线。我不确定从哪里开始。是否应该使用新变量创建
Python 矩阵
我想在 python 中创建一个每行三列的矩阵，并能够通过任何一行对它们进行索引。矩阵中的每个值都是唯一的。据我所知，我可以设置如下矩阵: matrix = [["username", "name"
java如何创建不同对象的数组/矩阵
我有点迷茫我创建了一个名为 person 的类，它具有 age 和 name 属性(以及 get set 方法)。然后在另一个类中，我想创建一个 persons 数组，其中每个人都有不同的年龄和姓名
Java多维散列/矩阵
我有 n 个类，它们要么堆叠，要么不堆叠。所有这些类都扩展了同一个类 (CellObject)。我知道更多类将添加到此列表中，我想创建一种易于在一个地方操纵“可堆叠性”的方法。我正在考虑创建一个矩阵
Python模糊字符串匹配作为相关样式表/矩阵
我有一个包含 x 个字符串名称及其关联 ID 的文件。本质上是两列数据。我想要的是一个格式为 x x x 的相关样式表(将相关数据同时作为 x 轴和 y 轴)，但我想要 fuzzywuzzy 库的函
机器学习的数学基础--向量，矩阵
机器学习与传统编程的一个重要区别在于机器学习比传统编程涉及了更多的数学知识。不过，随着机器学习的飞速发展，各种框架应运而生，在数据分析等应用中使用机器学习时，使用现成的库和框架成为常态，似乎越来越不需
Julia 问题与结束，矩阵
当我在 julia 中输入这个错误跳转但我不知道为什么，它应该工作。/ julia> A = [1 2 3 4; 5 6 7 8; 1 2 3 4; 5 6 7 8] 4×4 Array{Int64,

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

CUDA/C 矩阵乘法