gpt4 book ai didi

c++ - CUDA:对传递给 GPU 的数组的每个第 n 个点进行分组

转载 作者:行者123 更新时间:2023-11-30 20:13:28 25 4
gpt4 key购买 nike

我正在尝试在一台外部的 Unix 机器上使用 Tesla 卡，通过 CUDA 实现 k-means 算法。我读取输入文件，并将所有数据点的坐标存储在 dataX 和 dataY 数组中。下一步是每隔 centreInterval 个点选取一个点，并将其存储到在 GPU 内存中分配的另一个数组中。但是，我得到的只有“段错误”，而且由于显而易见的原因，无法从内核打印任何输出，所以我不知道该如何排查问题出在哪里。

编辑2:我将此示例简化为最短的解决方案。我在处理过程中找到了解决方案,但决定提供该问题中尚未解决的版本,以更清楚地说明导致问题的原因。

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#define BLOCK_SIZE 16

// Kernel: selects the initial centres at the beginning of the algorithm.
// Each thread computes its flat global index i and copies the data point at
// index i * centreInterval into slot i of the centre arrays.
//
// Parameters:
//   d_dataX, d_dataY       - input point coordinates (read at i * centreInterval)
//   d_centresX, d_centresY - output centre coordinates (written at i)
//   centreInterval         - distance between two selected data points
//
// NOTE(review): there is no bounds check here, so every launched thread reads
// and writes memory. The launch must therefore create no more threads than
// there are centres, otherwise the surplus threads access memory past the end
// of the arrays.
__global__ void kMeansSelectInitialCentres(float* d_dataX, float* d_dataY, float* d_centresX, float* d_centresY, int centreInterval) {

// Flat global thread index in a 1D launch; also the index of the centre to fill
int i = blockIdx.x * blockDim.x + threadIdx.x;
// Index of the data point that becomes centre i
int idx = i * centreInterval;
d_centresX[i] = d_dataX[idx];
d_centresY[i] = d_dataY[idx];
}

// Simplified example (the version from the question, which still contains the problem):
// fills two host arrays, copies them to the GPU, launches the selection kernel
// and prints the centres copied back from the device.
int main(int argn, char ** argc) {

// My data - let's say it is 32 floats in each
int dataSize = 32;
float* dataX = new float[dataSize];
float* dataY = new float[dataSize];

// Fill arrays with numbers (element i holds the value i)
for (int i = 0; i < dataSize; i++) {
dataX[i] = i;
dataY[i] = i;
}

// Interval - the kernel selects the data points at indices 0, centreInterval, 2 * centreInterval, ...
int centreInterval = 2;

// Number of centres, and host arrays meant to hold the results
// NOTE(review): centresX / centresY are allocated but never used in this listing
int centreSize = dataSize / centreInterval;
float* centresX = new float[centreSize];
float* centresY = new float[centreSize];

// Pointers to the arrays stored in GPU memory
float* d_dataX;
float* d_dataY;
float* d_centresX;
float* d_centresY;

// Allocate memory for those arrays
// Calculate how much space in memory do we need for this (sizes are in bytes)
// NOTE(review): d_centreSize is computed here but never used in this listing
size_t d_centreSize = sizeof(float) * centreSize;
size_t d_dataSize = sizeof(float) * dataSize;

// Memory for raw data
// NOTE(review): none of the CUDA calls in this function has its return code checked
cudaMalloc((void**)&d_dataX, d_dataSize);
cudaMalloc((void**)&d_dataY, d_dataSize);

// Copy raw data to the device memory so we can operate on it freely
cudaMemcpy(d_dataY, dataY, d_dataSize, cudaMemcpyHostToDevice);
cudaMemcpy(d_dataX, dataX, d_dataSize, cudaMemcpyHostToDevice);

// Memory for centre results
// NOTE(review): sized with d_dataSize (dataSize floats), not d_centreSize (centreSize floats)
cudaMalloc((void**)&d_centresX, d_dataSize);
cudaMalloc((void**)&d_centresY, d_dataSize);

// Call kernel
// NOTE(review): (centreSize + dimBlock.x) / dimBlock.x is not a ceiling division; with
// centreSize = 16 and BLOCK_SIZE = 16 it gives 2 blocks, i.e. 32 threads for 16 centres.
// The kernel has no bounds check, so threads 16..31 read d_dataX / d_dataY at
// indices 32..62, past the 32 elements that were allocated.
dim3 dimBlock(BLOCK_SIZE);
dim3 dimGridK((centreSize + dimBlock.x) / dimBlock.x);
kMeansSelectInitialCentres <<<dimGridK, dimBlock>>> (d_dataX, d_dataY, d_centresX, d_centresY, centreInterval);

// Check results - we get every n-th point
float* check_x = new float[centreSize];
float* check_y = new float[centreSize];

// NOTE(review): these copies transfer d_dataSize bytes (dataSize floats) into host buffers
// that hold only centreSize floats, so they write past the end of check_x / check_y.
// This host-side overflow is presumably what produces the reported segmentation fault.
cudaMemcpy(check_x, d_centresX, d_dataSize, cudaMemcpyDeviceToHost);
cudaMemcpy(check_y, d_centresY, d_dataSize, cudaMemcpyDeviceToHost);

// Print the selected centres, X coordinates first, then Y coordinates
printf("X: ");
for (int i = 0; i < centreSize; i++)
printf("%.2f ", check_x[i]);
printf("\nY: ");
for (int i = 0; i < centreSize; i++)
printf("%.2f ", check_y[i]);
printf("\n");

// NOTE(review): neither the device allocations nor the host arrays are released before exit
}

主要问题：这个内核，或者把数据从设备取回主机的过程，有什么问题？

附带问题:在这种情况下有没有公平的方法来调试程序内核?

最佳答案

下面是我在简化示例之后得到的解决方案。问题出在内存的使用上——我尝试存储/读取的数据量与分配内存时申请的数据量不一致。希望这对以后遇到同样问题的人有所帮助：

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#define BLOCK_SIZE 16

// Aborts with a readable message when a CUDA runtime call fails, so that a
// problem is reported at the failing call instead of showing up later as a crash.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Kernel: selects the initial centres at the beginning of the algorithm.
// Thread i copies the data point at index i * centreInterval into slot i of
// the centre arrays.
//
// Launch layout: 1D grid of 1D blocks, one thread per centre. The grid may be
// rounded up to whole blocks; surplus threads exit early when centreCount is given.
//
// Parameters:
//   d_dataX, d_dataY       - input point coordinates; must hold at least
//                            (centreCount - 1) * centreInterval + 1 elements
//   d_centresX, d_centresY - output centre coordinates; must hold centreCount elements
//   centreInterval         - distance between two selected data points
//   centreCount            - number of centres to produce; a negative value (the
//                            default) disables the bounds check, which keeps old
//                            five-argument launches working but requires the grid
//                            to contain exactly as many threads as centres
__global__ void kMeansSelectInitialCentres(float* d_dataX, float* d_dataY, float* d_centresX, float* d_centresY, int centreInterval, int centreCount = -1) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads in the rounded-up tail of the grid must not touch memory.
    if (centreCount >= 0 && i >= centreCount)
        return;

    int idx = i * centreInterval;
    d_centresX[i] = d_dataX[idx];
    d_centresY[i] = d_dataY[idx];
}

// Simplified example: fills two host arrays with 0..dataSize-1, copies them to
// the GPU, selects every centreInterval-th point as a centre and prints the result.
int main(int argn, char ** argc) {
    // My data - let's say it is 32 floats in each
    int dataSize = 32;
    float* dataX = new float[dataSize];
    float* dataY = new float[dataSize];

    // Fill arrays with numbers (element i holds the value i)
    for (int i = 0; i < dataSize; i++) {
        dataX[i] = (float)i;
        dataY[i] = (float)i;
    }

    // Interval - the kernel selects the points at indices 0, centreInterval, 2 * centreInterval, ...
    int centreInterval = 2;
    int centreSize = dataSize / centreInterval;

    // Sizes in bytes of the device buffers
    size_t d_centreSize = sizeof(float) * centreSize;
    size_t d_dataSize = sizeof(float) * dataSize;

    // Pointers to the arrays stored in GPU memory
    float* d_dataX = NULL;
    float* d_dataY = NULL;
    float* d_centresX = NULL;
    float* d_centresY = NULL;

    // Memory for raw data
    CUDA_CHECK(cudaMalloc((void**)&d_dataX, d_dataSize));
    CUDA_CHECK(cudaMalloc((void**)&d_dataY, d_dataSize));

    // Copy raw data to the device memory so we can operate on it freely
    CUDA_CHECK(cudaMemcpy(d_dataY, dataY, d_dataSize, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_dataX, dataX, d_dataSize, cudaMemcpyHostToDevice));

    // Memory for centre results - one float per centre, not per data point
    CUDA_CHECK(cudaMalloc((void**)&d_centresX, d_centreSize));
    CUDA_CHECK(cudaMalloc((void**)&d_centresY, d_centreSize));

    // Call kernel: ceiling division gives the smallest grid that covers all
    // centres; the kernel's centreCount guard handles the partial last block.
    dim3 dimBlock(BLOCK_SIZE);
    dim3 dimGridK((centreSize + dimBlock.x - 1) / dimBlock.x);
    kMeansSelectInitialCentres <<<dimGridK, dimBlock>>> (d_dataX, d_dataY, d_centresX, d_centresY, centreInterval, centreSize);
    CUDA_CHECK(cudaGetLastError());        // reports an invalid launch configuration
    CUDA_CHECK(cudaDeviceSynchronize());   // reports faults raised while the kernel ran

    // Check results - we get every n-th point
    float* check_x = new float[centreSize];
    float* check_y = new float[centreSize];

    // Copy back exactly as many bytes as the host buffers can hold
    CUDA_CHECK(cudaMemcpy(check_x, d_centresX, d_centreSize, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(check_y, d_centresY, d_centreSize, cudaMemcpyDeviceToHost));

    printf("X: ");
    for (int i = 0; i < centreSize; i++)
        printf("%.2f ", check_x[i]);
    printf("\nY: ");
    for (int i = 0; i < centreSize; i++)
        printf("%.2f ", check_y[i]);
    printf("\n");

    // Release device and host memory
    CUDA_CHECK(cudaFree(d_centresY));
    CUDA_CHECK(cudaFree(d_centresX));
    CUDA_CHECK(cudaFree(d_dataY));
    CUDA_CHECK(cudaFree(d_dataX));
    delete[] check_y;
    delete[] check_x;
    delete[] dataY;
    delete[] dataX;

    return 0;
}

关于c++ - CUDA:对传递给 GPU 的数组的每个第 n 个点进行分组,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/30547616/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com