gpt4 book ai didi

c++ - CUDA内核代码中矩阵的多次乘法

转载 作者:行者123 更新时间:2023-11-28 04:47:20 25 4
gpt4 key购买 nike

矩阵乘法函数:

// Dense matrix multiply C = A * B for row-major matrices.
// A is m x n, B is n x k, C is m x k. Launch with a 2D grid/block;
// each thread computes exactly one element of C.
__global__ void gpu_matrix_mult(float *a, float *b, float *c, int m, int n, int k)
{
const int row = blockDim.y * blockIdx.y + threadIdx.y;
const int col = blockDim.x * blockIdx.x + threadIdx.x;

// Guard: the grid rarely divides the matrix exactly, so threads
// falling outside the output bounds do nothing.
if (row >= m || col >= k)
return;

// Dot product of row `row` of A with column `col` of B.
float acc = 0.0f;
for (int e = 0; e < n; ++e)
acc += a[row * n + e] * b[e * k + col];
c[row * k + col] = acc;
}

然后在下面的循环中调用该函数:

int currentActivityCount = -1;

// Iterate until the activity count stabilizes between two consecutive passes.
// NOTE: the buffers are float arrays (see the kernel signature), so every copy
// must be sized with sizeof(float) — the original code used sizeof(int), which
// copies the wrong number of bytes whenever sizeof(int) != sizeof(float) is
// assumed, and is wrong in intent regardless.
while (activityCount != currentActivityCount)
{
if (currentActivityCount > -1)
{
// Re-upload the thresholded matrix produced by the previous iteration.
cudaMemcpy(d_b, h_b_new, sizeof(float)*m*k, cudaMemcpyHostToDevice);
}

gpu_matrix_mult << <dimGrid, dimBlock >> >(d_a, d_b, d_c, m, n, k);

// Blocking copy: also synchronizes with the kernel launched above.
cudaMemcpy(h_c, d_c, sizeof(float)*m*k, cudaMemcpyDeviceToHost);

currentActivityCount = activityCount;
activityCount = 0;

// Threshold the result on the host: any element >= 0.5 counts as "active"
// and becomes 1 in the next iteration's input matrix, otherwise 0.
for (int i = 0; i < m; ++i)
{
for (int j = 0; j < k; ++j)
{
if (h_c[i*k + j] >= 0.5f)
{
activityCount++;

h_b_new[i * k + j] = 1;
}
else
{
h_b_new[i * k + j] = 0;
}
}
}

during++;
printf("Count of activity: %d During: %d\n", activityCount, during);
}

我的目标是将此循环移动到“gpu_matrix_mult”函数中,以便 GPU 之间的数据传输仅发生两次,这意味着调用函数之前和之后,而不是在循环的每次迭代中。我一直在尝试一些方法,但都没有奏效。该解决方案是否可行?

最佳答案

你可以像这样在内核中做一些事情:

// Device-global counter of "active" output elements; reset from the host with
// cudaMemcpyToSymbol before each launch and read back with cudaMemcpyFromSymbol.
__device__ int activityCount;

// Fused kernel: C = A * B0 (row-major, A is m x n, B0 is n x k, C is m x k),
// immediately followed by per-element thresholding into B1 so the host no
// longer needs to copy C back and re-upload B each iteration.
// Each thread computes one element of C and the matching element of B1.
__global__ void gpu_matrix_mult(float *a, float *b0, float *b1, float *c, int m, int n, int k)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
float sum = 0.0f;
if (col < k && row < m)
{
for (int i = 0; i < n; i++)
{
sum += a[row * n + i] * b0[i * k + col];
}
c[row * k + col] = sum;
// Threshold in place of the old host-side loop. The original answer
// indexed b1 with `i * k + j`, but `i` is the (already finished) inner
// loop counter and `j` was never declared — the element this thread
// owns is b1[row * k + col].
if (sum >= 0.5f)
{
// Contended single counter; acceptable here because only one
// atomic is issued per active output element.
atomicAdd(&activityCount, 1);
b1[row * k + col] = 1;
}
else
{
b1[row * k + col] = 0;
}
}
}

// .............


int currentActivityCount = -1;
int activityCount_h = 0;
// Iterate until the activity count stabilizes. Per outer iteration the only
// host<->device traffic is two copies of a single int (the counter symbol).
while (activityCount_h != currentActivityCount)
{
if (currentActivityCount > -1)
{
// Ping-pong: last iteration's thresholded output (d_b1) becomes this
// iteration's input (d_b0). A pointer swap on the host costs nothing.
float *tmp = d_b0;
d_b0 = d_b1;
d_b1 = tmp;
}
currentActivityCount = activityCount_h;
activityCount_h = 0;
// Zero the device-side counter before the launch.
cudaMemcpyToSymbol(activityCount, &activityCount_h, sizeof(int));
gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b0, d_b1, d_c, m, n, k);
// Read the counter back. Fixes three bugs in the original answer:
// the API is cudaMemcpyFromSymbol (capital F), the symbol argument is
// `activityCount` (there is no `activity`), and the host must print the
// host copy `activityCount_h` — the device symbol is not readable here.
cudaMemcpyFromSymbol(&activityCount_h, activityCount, sizeof(int));

during++;
printf("Count of activity: %d During: %d\n", activityCount_h, during);
}

[显然从未编译或运行,使用风险自负]

即用于计算 activityCount 的内部循环可以在矩阵乘法之后在设备的内核中运行。这需要在 GPU 的内存中有两个 b 矩阵,但主机上只需要交换指针来更新它们,这基本上是零成本。这样每次外循环迭代的内存传输就从两次大矩阵拷贝减少为传输单个整数,这将相当快。

关于c++ - CUDA内核代码中矩阵的多次乘法,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/49017221/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com