I am new to CUDA, and I am trying to offload to the GPU some of the tedious computations I do for a performance-critical project. There are two NVS 510 cards on my machine, but I am currently experimenting with only one of them.
I have some large column-major matrices (1000-5000 rows x 1-5 million columns) to fill. So far I have been able to write code that fills the matrix as if it were a flat array, and it works fine for relatively small matrices.
__global__ void interp_kernel(fl_type * d_matrix, fl_type* weights, [other params],
int n_rows, int num_cols) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int column = index / n_rows;
int row = index % n_rows;
if (row > n_rows || column > num_cols) return;
d_matrix[index] = …something(row, column,[other params]);
}
The kernel call:
fl_type *res;
cudaMalloc((void**)&res, n_columns*n_rows*fl_size);
int block_size = 1024;
int num_blocks = (n_rows* n_columns + block_size - 1) / block_size;
std::cout << "num_blocks:" << num_blocks << std::endl;
interp_kernel << < num_blocks, block_size >> > (res,[other params], n_rows,n_columns);
Everything works fine. If I change the kernel to use 2D threads:
__global__ void interp_kernel2D(fl_type * d_matrix, fl_type* weights, [other params],
int n_rows, int num_cols) {
int column = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = column* n_rows + row;
if (row > n_rows || column > num_cols) return;
d_matrix[index] = …something(row, column,[other params]);
}
and then I call it with:
int block_size2 = 32; //each block will have block_size2*block_size2 threads
dim3 num_blocks2(block_size2, block_size2);
int x_grid = (n_columns + block_size2 - 1) / block_size2;
int y_grid = (n_rows + block_size2 - 1) / block_size2;
dim3 grid_size2(x_grid, y_grid);
interp_kernel2D <<< grid_size2, num_blocks2 >>> (res,[other params], n_rows,n_columns);
the result is all zeros and CUDA returns an unknown error. What am I missing? The actual code, which compiles without errors with VS2015 and CUDA 8.0, can be found here: https://pastebin.com/XBCVC7VV
Here is the code from the pastebin link:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <assert.h>
#include <iostream>
#include <random>
#include <chrono>
typedef float fl_type;
typedef int pos_type;
typedef std::chrono::milliseconds ms;
//declaration of the cuda function
void cuda_interpolation_function(fl_type* interp_value_back, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim);
fl_type iterp_cpu(fl_type* weights, pos_type* node_map, fl_type* grid_values, int& row, int& column, int& interp_dim, int& n_sim) {
int w_p = column*interp_dim;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[node_map[w_p + inter_point] * n_sim + row];
}
return res;
}
__global__ void interp_kernel(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int column = index / n_sim;
int row = index % n_sim;
int w_p = column*interp_dim;
if (row > n_sim || column > num_cols) return;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
}
d_matrix[index] = res;
}
__global__ void interp_kernel2D(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int column = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = column*n_sim + row;
int w_p = column*interp_dim;
if (row > n_sim || column > num_cols) return;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
}
d_matrix[index] = res;
}
void verify(fl_type *host, fl_type *device, int size) {
int count = 0;
int count_zero = 0;
for (int i = 0; i < size; i++) {
if (host[i] != device[i]) {
count++;
//std::cout <<"pos: " <<i<< " CPU:" <<h[i] << ", GPU: " << d[i] <<std::endl;
assert(host[i] == device[i]);
if (device[i] == 0.0)
count_zero++;
}
}
if (count) {
std::cout << "Non matching: " << count << "out of " << size << "(" << (float(count) / size * 100) << "%)" << std::endl;
std::cout << "Zeros returned from the device: " << count_zero <<"(" << (float(count_zero) / size * 100) << "%)" << std::endl;
}
else
std::cout << "Perfect match!" << std::endl;
}
int main() {
int fl_size = sizeof(fl_type);
int pos_size = sizeof(pos_type);
int dim = 5; // range: 2-5
int number_nodes = 5500; // range: 10.000-500.000
int max_actions = 12; // range: 6-200
int n_sim = 1000; // range: 1.000-10.000
int interp_dim = std::pow(2, dim);
int grid_values_size = n_sim*number_nodes;
std::default_random_engine generator;
std::normal_distribution<fl_type> normal_dist(0.0, 1);
std::uniform_int_distribution<> uniform_dist(0, number_nodes - 1);
double bit_allocated = 0;
fl_type * grid_values; //flattened 2d array, containing the value of the grid (n_sims x number_nodes)
grid_values = (fl_type *)malloc(grid_values_size * fl_size);
bit_allocated += grid_values_size * fl_size;
for (int i = 0; i < grid_values_size; i++)
grid_values[i] = normal_dist(generator);
pos_type * map_node2values_start; //vector that maps each node to the first column of the result matrix for that node
pos_type * map_node2values_how_many; //vector that stores how many action we have per node
map_node2values_start = (pos_type *)malloc(number_nodes * pos_size);
map_node2values_how_many = (pos_type *)malloc(number_nodes * pos_size);
bit_allocated += 2 * (number_nodes * pos_size);
for (int i = 0; i < number_nodes; i++) {
//each node has simply max_actions
map_node2values_start[i] = max_actions*i;
map_node2values_how_many[i] = max_actions;
}
//total number of actions, which is amount of column of the results
int total_action_number = map_node2values_start[number_nodes - 1] + map_node2values_how_many[number_nodes - 1];
//vector that keeps track of the columns to grab, and their weight in the interpolation
fl_type* weights;
pos_type * node_map;
weights = (fl_type *)malloc(total_action_number*interp_dim * pos_size);
bit_allocated += total_action_number * fl_size;
node_map = (pos_type *)malloc(total_action_number*interp_dim * pos_size);
bit_allocated += total_action_number * pos_size;
//filling with random numbers
for (int i = 0; i < total_action_number*interp_dim; i++) {
node_map[i] = uniform_dist(generator); // picking random column
weights[i] = 1.0 / interp_dim; // uniform weights
}
std::cout << "done filling!" << std::endl;
std::cout << bit_allocated / 8 / 1024 / 1024 << "MB allocated" << std::endl;
int result_size = n_sim*total_action_number;
fl_type *interp_value_cpu;
bit_allocated += result_size* fl_size;
interp_value_cpu = (fl_type *)malloc(result_size* fl_size);
auto start = std::chrono::steady_clock::now();
for (int row = 0; row < n_sim; row++) {
for (int column = 0; column < total_action_number; column++) {
auto zz = iterp_cpu(weights, node_map, grid_values, row, column, interp_dim, n_sim);
interp_value_cpu[column*n_sim + row] = zz;
}
}
auto elapsed_cpu = std::chrono::steady_clock::now() - start;
std::cout << "Crunching values on the CPU (serial): " << std::chrono::duration_cast<ms>(elapsed_cpu).count() / 1000.0 << "s" << std::endl;
int * pp;
cudaMalloc((void**)&pp, sizeof(int)); //initializing the device, to not affect the benchmark
fl_type *interp_value_gpu;
interp_value_gpu = (fl_type *)malloc(result_size* fl_size);
start = std::chrono::steady_clock::now();
cuda_interpolation_function(interp_value_gpu, result_size, grid_values, grid_values_size, weights, node_map, total_action_number, interp_dim, n_sim);
auto elapsed_gpu = std::chrono::steady_clock::now() - start;
std::cout << "Crunching values on the GPU: " << std::chrono::duration_cast<ms>(elapsed_gpu).count() / 1000.0 << "s" << std::endl;
float ms_cpu = std::chrono::duration_cast<ms>(elapsed_cpu).count();
float ms_gpu = std::chrono::duration_cast<ms>(elapsed_gpu).count();
int n_proc = 4;
std::cout << "Performance: " << (ms_gpu- ms_cpu / n_proc) / (ms_cpu / n_proc) * 100 << " % less time than parallel CPU!" << std::endl;
verify(interp_value_cpu, interp_value_gpu, result_size);
free(interp_value_cpu);
free(interp_value_gpu);
free(grid_values);
free(node_map);
free(weights);
}
void cuda_interpolation_function(fl_type* interp_value_gpu, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim) {
int fl_size = sizeof(fl_type);
int pos_size = sizeof(pos_type);
auto start = std::chrono::steady_clock::now();
//device versions of the inputs
fl_type * grid_values_device;
fl_type* weights_device;
pos_type * node_map_device;
fl_type *interp_value_device;
int lenght_node_map = interp_dim*total_action_number;
std::cout << "size grid_values: " << grid_values_size <<std::endl;
std::cout << "size weights: " << lenght_node_map << std::endl;
std::cout << "size interp_value: " << result_size << std::endl;
//allocating and moving to the GPU the inputs
auto error_code=cudaMalloc((void**)&grid_values_device, grid_values_size*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of the grid_values" << std::endl;
}
error_code=cudaMemcpy(grid_values_device, grid_values, grid_values_size*fl_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of the grid_values" << std::endl;
}
error_code=cudaMalloc((void**)&weights_device, lenght_node_map*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of the weights" << std::endl;
}
error_code=cudaMemcpy(weights_device, weights, lenght_node_map*fl_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of the weights" << std::endl;
}
error_code=cudaMalloc((void**)&node_map_device, lenght_node_map*pos_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of node_map" << std::endl;
}
error_code=cudaMemcpy(node_map_device, node_map, lenght_node_map*pos_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of node_map" << std::endl;
}
error_code=cudaMalloc((void**)&interp_value_device, result_size*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of interp_value_device " << std::endl;
}
auto elapsed_moving = std::chrono::steady_clock::now() - start;
float ms_moving = std::chrono::duration_cast<ms>(elapsed_moving).count();
cudaDeviceSynchronize();
//1d
int block_size = 1024;
int num_blocks = (result_size + block_size - 1) / block_size;
std::cout << "num_blocks:" << num_blocks << std::endl;
interp_kernel << < num_blocks, block_size >> > (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
//2d
//int block_size2 = 32; //each block will have block_size2*block_size2 threads
//dim3 num_blocks2(block_size2, block_size2);
//int x_grid = (total_action_number + block_size2 - 1) / block_size2;
//int y_grid = (n_sim + block_size2 - 1) / block_size2;
//dim3 grid_size2(x_grid, y_grid);
//std::cout <<"grid:"<< x_grid<<" x "<< y_grid<<std::endl;
//interp_kernel2D <<< grid_size2, num_blocks2 >>> (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
cudaDeviceSynchronize();
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
std::cout << "Cuda kernel failed! " << cudaGetErrorString(err) <<std::endl;
}
start = std::chrono::steady_clock::now();
cudaMemcpy(interp_value_gpu, interp_value_device, result_size*fl_size, cudaMemcpyDeviceToHost);
auto elapsed_moving_back = std::chrono::steady_clock::now() - start;
float ms_moving_back = std::chrono::duration_cast<ms>(elapsed_moving_back).count();
std::cout << "Time spent moving the data to the GPU:" << ms_moving << " ms"<<std::endl;
std::cout << "Time spent moving the results back to the host: " << ms_moving_back << " ms" << std::endl;
cudaFree(interp_value_device);
cudaFree(weights_device);
cudaFree(node_map_device);
cudaFree(grid_values_device);
}
Also, I would really appreciate any pointers on how to improve the performance of the code.
Best Answer
Any time you are having trouble with CUDA code, I recommend doing proper CUDA error checking (which you appear to be mostly doing) and also running your code with cuda-memcheck. This last utility is similar to, but not identical to, the "Enable Memory Checker" in Nsight VSE; the Nsight VSE memory checker may well have given you the same indication, however.
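For reference, here is a minimal sketch of what such systematic error checking can look like; the CUDA_CHECK macro name and the d_buf buffer in the usage comment are illustrative only, not taken from the code in the question:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
// Hypothetical helper: wraps any CUDA runtime call and aborts with a readable
// message if the call did not return cudaSuccess.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            std::fprintf(stderr, "CUDA error '%s' at %s:%d\n",            \
                         cudaGetErrorString(err_), __FILE__, __LINE__);   \
            std::exit(EXIT_FAILURE);                                      \
        }                                                                 \
    } while (0)
// Typical usage around a kernel launch:
//   CUDA_CHECK(cudaMalloc((void**)&d_buf, n * sizeof(fl_type)));
//   interp_kernel<<<num_blocks, block_size>>>(/* ... */);
//   CUDA_CHECK(cudaGetLastError());        // catches launch-configuration errors
//   CUDA_CHECK(cudaDeviceSynchronize());   // catches errors raised while the kernel runs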
In C (or C++), array indexing normally starts at zero. Therefore, to test for an out-of-bounds index, I must check whether the generated index is equal to or greater than the size of the array. But in your case you are only testing for greater than:
if (row > n_sim || column > num_cols) return;
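With 0-based indexing, the check must also reject an index that is equal to the size, so the comparison needs to be >= rather than > (this is exactly the change marked "// modified" in the code below):
if (row >= n_sim || column >= num_cols) return; // reject index == size as well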
You make a similar error in both the 1D kernel and the 2D kernel, and although you believe the 1D kernel works correctly, it is actually making out-of-bounds accesses. You can verify this if you run with the aforementioned cuda-memcheck utility (or probably also with the memory checker that can be enabled in Nsight VSE).
When I modify your code from the pastebin link to use proper range/bounds checking, cuda-memcheck reports no errors and your program reports the correct results. I have tested both cases, but the code below is your pastebin code modified to uncomment the 2D case and use it in place of the 1D case:
$ cat t375.cu | more
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <assert.h>
#include <iostream>
#include <random>
#include <chrono>
typedef float fl_type;
typedef int pos_type;
typedef std::chrono::milliseconds ms;
//declaration of the cuda function
void cuda_interpolation_function(fl_type* interp_value_back, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim);
fl_type iterp_cpu(fl_type* weights, pos_type* node_map, fl_type* grid_values, int& row, int& column, int& interp_dim, int& n_sim) {
int w_p = column*interp_dim;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[node_map[w_p + inter_point] * n_sim + row];
}
return res;
}
__global__ void interp_kernel(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int column = index / n_sim;
int row = index % n_sim;
int w_p = column*interp_dim;
if (row >= n_sim || column >= num_cols) return; // modified
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
}
d_matrix[index] = res;
}
__global__ void interp_kernel2D(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int column = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = column*n_sim + row;
int w_p = column*interp_dim;
if (row >= n_sim || column >= num_cols) return; // modified
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
}
d_matrix[index] = res;
}
void verify(fl_type *host, fl_type *device, int size) {
int count = 0;
int count_zero = 0;
for (int i = 0; i < size; i++) {
if (host[i] != device[i]) {
count++;
//std::cout <<"pos: " <<i<< " CPU:" <<h[i] << ", GPU: " << d[i] <<std::endl;
assert(host[i] == device[i]);
if (device[i] == 0.0)
count_zero++;
}
}
if (count) {
std::cout << "Non matching: " << count << "out of " << size << "(" << (f
loat(count) / size * 100) << "%)" << std::endl;
std::cout << "Zeros returned from the device: " << count_zero <<"(" << (
float(count_zero) / size * 100) << "%)" << std::endl;
}
else
std::cout << "Perfect match!" << std::endl;
}
int main() {
int fl_size = sizeof(fl_type);
int pos_size = sizeof(pos_type);
int dim = 5; // range: 2-5
int number_nodes = 5500; // range: 10.000-500.000
int max_actions = 12; // range: 6-200
int n_sim = 1000; // range: 1.000-10.000
int interp_dim = std::pow(2, dim);
int grid_values_size = n_sim*number_nodes;
std::default_random_engine generator;
std::normal_distribution<fl_type> normal_dist(0.0, 1);
std::uniform_int_distribution<> uniform_dist(0, number_nodes - 1);
double bit_allocated = 0;
fl_type * grid_values; //flattened 2d array, containing the value of the grid (n_sims x number_nodes)
grid_values = (fl_type *)malloc(grid_values_size * fl_size);
bit_allocated += grid_values_size * fl_size;
for (int i = 0; i < grid_values_size; i++)
grid_values[i] = normal_dist(generator);
pos_type * map_node2values_start; //vector that maps each node to the first column of the result matrix for that node
pos_type * map_node2values_how_many; //vector that stores how many action we have per node
map_node2values_start = (pos_type *)malloc(number_nodes * pos_size);
map_node2values_how_many = (pos_type *)malloc(number_nodes * pos_size);
bit_allocated += 2 * (number_nodes * pos_size);
for (int i = 0; i < number_nodes; i++) {
//each node has simply max_actions
map_node2values_start[i] = max_actions*i;
map_node2values_how_many[i] = max_actions;
}
//total number of actions, which is amount of column of the results
int total_action_number = map_node2values_start[number_nodes - 1] + map_node2values_how_many[number_nodes - 1];
//vector that keeps track of the columns to grab, and their weight in the interpolation
fl_type* weights;
pos_type * node_map;
weights = (fl_type *)malloc(total_action_number*interp_dim * pos_size);
bit_allocated += total_action_number * fl_size;
node_map = (pos_type *)malloc(total_action_number*interp_dim * pos_size);
bit_allocated += total_action_number * pos_size;
//filling with random numbers
for (int i = 0; i < total_action_number*interp_dim; i++) {
node_map[i] = uniform_dist(generator); // picking random column
weights[i] = 1.0 / interp_dim; // uniform weights
}
std::cout << "done filling!" << std::endl;
std::cout << bit_allocated / 8 / 1024 / 1024 << "MB allocated" << std::endl;
int result_size = n_sim*total_action_number;
fl_type *interp_value_cpu;
bit_allocated += result_size* fl_size;
interp_value_cpu = (fl_type *)malloc(result_size* fl_size);
auto start = std::chrono::steady_clock::now();
for (int row = 0; row < n_sim; row++) {
for (int column = 0; column < total_action_number; column++) {
auto zz = iterp_cpu(weights, node_map, grid_values, row, column, interp_dim, n_sim);
interp_value_cpu[column*n_sim + row] = zz;
}
}
auto elapsed_cpu = std::chrono::steady_clock::now() - start;
std::cout << "Crunching values on the CPU (serial): " << std::chrono::duration_cast<ms>(elapsed_cpu).count() / 1000.0 << "s" << std::endl;
int * pp;
cudaMalloc((void**)&pp, sizeof(int)); //initializing the device, to not affect the benchmark
fl_type *interp_value_gpu;
interp_value_gpu = (fl_type *)malloc(result_size* fl_size);
start = std::chrono::steady_clock::now();
cuda_interpolation_function(interp_value_gpu, result_size, grid_values, grid_values_size, weights, node_map, total_action_number, interp_dim, n_sim);
auto elapsed_gpu = std::chrono::steady_clock::now() - start;
std::cout << "Crunching values on the GPU: " << std::chrono::duration_cast<ms>(elapsed_gpu).count() / 1000.0 << "s" << std::endl;
float ms_cpu = std::chrono::duration_cast<ms>(elapsed_cpu).count();
float ms_gpu = std::chrono::duration_cast<ms>(elapsed_gpu).count();
int n_proc = 4;
std::cout << "Performance: " << (ms_gpu- ms_cpu / n_proc) / (ms_cpu / n_proc) * 100 << " % less time than parallel CPU!" << std::endl;
verify(interp_value_cpu, interp_value_gpu, result_size);
free(interp_value_cpu);
free(interp_value_gpu);
free(grid_values);
free(node_map);
free(weights);
}
void cuda_interpolation_function(fl_type* interp_value_gpu, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim) {
int fl_size = sizeof(fl_type);
int pos_size = sizeof(pos_type);
auto start = std::chrono::steady_clock::now();
//device versions of the inputs
fl_type * grid_values_device;
fl_type* weights_device;
pos_type * node_map_device;
fl_type *interp_value_device;
int lenght_node_map = interp_dim*total_action_number;
std::cout << "size grid_values: " << grid_values_size <<std::endl;
std::cout << "size weights: " << lenght_node_map << std::endl;
std::cout << "size interp_value: " << result_size << std::endl;
//allocating and moving to the GPU the inputs
auto error_code=cudaMalloc((void**)&grid_values_device, grid_values_size*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of the grid_values" << std::endl;
}
error_code=cudaMemcpy(grid_values_device, grid_values, grid_values_size*fl_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of the grid_values" << std::endl;
}
error_code=cudaMalloc((void**)&weights_device, lenght_node_map*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of the weights" << std::endl;
}
error_code=cudaMemcpy(weights_device, weights, lenght_node_map*fl_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of the weights" << std::endl;
}
error_code=cudaMalloc((void**)&node_map_device, lenght_node_map*pos_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of node_map" << std::endl;
}
error_code=cudaMemcpy(node_map_device, node_map, lenght_node_map*pos_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of node_map" << std::endl;
}
error_code=cudaMalloc((void**)&interp_value_device, result_size*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of interp_value_device " << std::endl;
}
auto elapsed_moving = std::chrono::steady_clock::now() - start;
float ms_moving = std::chrono::duration_cast<ms>(elapsed_moving).count();
cudaDeviceSynchronize();
//1d
#if 0
int block_size = 1024;
int num_blocks = (result_size + block_size - 1) / block_size;
std::cout << "num_blocks:" << num_blocks << std::endl;
interp_kernel << < num_blocks, block_size >> > (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
#endif
//2d
int block_size2 = 32; //each block will have block_size2*block_size2 threads
dim3 num_blocks2(block_size2, block_size2);
int x_grid = (total_action_number + block_size2 - 1) / block_size2;
int y_grid = (n_sim + block_size2 - 1) / block_size2;
dim3 grid_size2(x_grid, y_grid);
std::cout <<"grid:"<< x_grid<<" x "<< y_grid<<std::endl;
interp_kernel2D <<< grid_size2, num_blocks2 >>> (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
cudaDeviceSynchronize();
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
std::cout << "Cuda kernel failed! " << cudaGetErrorString(err) <<std::endl;
}
start = std::chrono::steady_clock::now();
cudaMemcpy(interp_value_gpu, interp_value_device, result_size*fl_size, cudaMemcpyDeviceToHost);
auto elapsed_moving_back = std::chrono::steady_clock::now() - start;
float ms_moving_back = std::chrono::duration_cast<ms>(elapsed_moving_back).count();
std::cout << "Time spent moving the data to the GPU:" << ms_moving << " ms"<<std::endl;
std::cout << "Time spent moving the results back to the host: " << ms_moving_back << " ms" << std::endl;
cudaFree(interp_value_device);
cudaFree(weights_device);
cudaFree(node_map_device);
cudaFree(grid_values_device);
}
$ nvcc -arch=sm_52 -o t375 t375.cu -std=c++11
$ cuda-memcheck ./t375
========= CUDA-MEMCHECK
done filling!
2.69079MB allocated
Crunching values on the CPU (serial): 30.081s
size grid_values: 5500000
size weights: 2112000
size interp_value: 66000000
grid:2063 x 32
Time spent moving the data to the GPU:31 ms
Time spent moving the results back to the host: 335 ms
Crunching values on the GPU: 7.089s
Performance: -5.73452 % less time than parallel CPU!
Perfect match!
========= ERROR SUMMARY: 0 errors
$
Note that cuda-memcheck slows down execution of the program on the GPU in order to do rigorous memory bounds checking, so the performance above may not be representative of the ordinary case. This is what an "ordinary" run looks like:
$ ./t375
done filling!
2.69079MB allocated
Crunching values on the CPU (serial): 30.273s
size grid_values: 5500000
size weights: 2112000
size interp_value: 66000000
grid:2063 x 32
Time spent moving the data to the GPU:32 ms
Time spent moving the results back to the host: 332 ms
Crunching values on the GPU: 1.161s
Performance: -84.6596 % less time than parallel CPU!
Perfect match!
$
Regarding "c++ - CUDA: Filling a column-major matrix", a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/45398168/