- html - 出于某种原因,IE8 对我的 Sass 文件中继承的 html5 CSS 不友好?
- JMeter 在响应断言中使用 span 标签的问题
- html - 在 :hover and :active? 上具有不同效果的 CSS 动画
- html - 相对于居中的 html 内容固定的 CSS 重复背景?
我是 CUDA 的新手,我正在尝试将我为性能关键项目所做的一些繁琐计算卸载到 GPU。我的电脑上有两张 NVS 510 显卡,但我目前只试用一张。
我有一些很大的列优先(column-major)矩阵(1000-5000 行 x 100 万-500 万列)需要填充。到目前为止,我已经能够编写代码,像填充一维数组一样填充矩阵,并且它对相对较小的矩阵有效。
// Simplified 1D excerpt: one thread per matrix element. The flat thread index is
// split into (column, row) for a column-major matrix with n_rows rows.
__global__ void interp_kernel(fl_type * d_matrix, fl_type* weights, [other params],
int n_rows, int num_cols) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int column = index / n_rows;
int row = index % n_rows;
// NOTE(review): `n_sim` is not a parameter of this excerpt (the row count is n_rows),
// and `>` lets column == num_cols through, so index can reach n_rows*num_cols and beyond.
if (row > n_sim || column > num_cols) return;
d_matrix[index] = …something(row, column,[other params]);
}
内核调用:
// Host side of the 1D version: allocate the result matrix on the device and launch
// at least one thread per element.
fl_type *res;
cudaMalloc((void**)&res, n_columns*n_rows*fl_size);
int block_size = 1024;
// Rounded-up division: the last block may contain threads beyond the last element.
int num_blocks = (n_rows* n_columns + block_size - 1) / block_size;
std::cout << "num_blocks:" << num_blocks << std::endl;
interp_kernel << < num_blocks, block_size >> > (res,[other params], n_rows,n_columns);
一切正常。如果我更改内核以使用 2D 线程:
// Simplified 2D excerpt: the thread's x coordinate selects the column and its
// y coordinate selects the row of the column-major matrix.
__global__ void interp_kernel2D(fl_type * d_matrix, fl_type* weights, [other params],
int n_rows, int num_cols) {
int column = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
// Column-major position of element (row, column).
int index = column* n_rows + row;
// NOTE(review): `>` admits row == n_rows and column == num_cols, both of which are
// outside the valid zero-based ranges.
if (row > n_rows || column > num_cols) return;
d_matrix[index] = …something(row, column,[other params]);
}
然后我调用它
// Host side of the 2D version: square blocks of block_size2 x block_size2 threads,
// with the grid rounded up in both dimensions.
int block_size2 = 32; //each block will have block_size2*block_size2 threads
// NOTE: despite its name, num_blocks2 is the block shape (threads per block);
// it is passed as the second launch argument below.
dim3 num_blocks2(block_size2, block_size2);
// Blocks along x, computed from the column count (rounded up).
int x_grid = (n_columns + block_size2 - 1) / block_size2;
// Blocks along y, computed from the row count (rounded up).
int y_grid = (n_rows + block_size2 - 1) / block_size2;
dim3 grid_size2(x_grid, y_grid);
interp_kernel2D <<< grid_size2, num_blocks2 >>> (res,[other params], n_rows,n_columns);
结果全为零,CUDA 返回未知错误。我错过了什么?可以在此处找到使用 VS2015 和 CUDA 8.0 编译无误的实际代码:https://pastebin.com/XBCVC7VV
这是来自 pastebin 链接的代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <assert.h>
#include <iostream>
#include <random>
#include <chrono>
// Floating-point type of the grid values, weights and results.
typedef float fl_type;
// Integer type of the node_map entries.
typedef int pos_type;
// Duration unit used when printing timings.
typedef std::chrono::milliseconds ms;
// Forward declaration of the GPU entry point, which is defined after main().
void cuda_interpolation_function(fl_type* interp_value_back, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim);
// CPU reference for a single element (row, column) of the result matrix.
// Returns the sum, over the interp_dim entries k that start at column*interp_dim,
// of weights[k] * grid_values[row + node_map[k]*n_sim].
// The first entry is read unconditionally, so interp_dim is expected to be >= 1.
fl_type iterp_cpu(fl_type* weights, pos_type* node_map, fl_type* grid_values, int& row, int& column, int& interp_dim, int& n_sim) {
    const int first = column * interp_dim;
    const int last = first + interp_dim; // one past this column's final entry
    fl_type acc = weights[first] * grid_values[row + node_map[first] * n_sim];
    for (int k = first + 1; k < last; ++k) {
        acc += weights[k] * grid_values[node_map[k] * n_sim + row];
    }
    return acc;
}
// 1D kernel: each thread computes one element of the column-major result d_matrix.
// The flat thread index is used directly as the element's position in d_matrix and
// is split into column = index / n_sim and row = index % n_sim.
// The value is the sum, over the interp_dim entries k starting at column*interp_dim,
// of weights[k] * grid_values[row + node_map[k]*n_sim].
__global__ void interp_kernel(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int column = index / n_sim;
int row = index % n_sim;
// Offset of this column's first weight / node_map entry.
int w_p = column*interp_dim;
// NOTE(review): indices are zero-based, so column == num_cols is already out of range,
// yet `>` lets it pass; any threads beyond the last element then read and write
// past the end of the arrays.
if (row > n_sim || column > num_cols) return;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
}
d_matrix[index] = res;
}
// 2D kernel: the thread's x coordinate selects the column and its y coordinate the
// row; the element is stored at column*n_sim + row in the column-major d_matrix.
// The value is the sum, over the interp_dim entries k starting at column*interp_dim,
// of weights[k] * grid_values[row + node_map[k]*n_sim].
__global__ void interp_kernel2D(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int column = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = column*n_sim + row;
// Offset of this column's first weight / node_map entry.
int w_p = column*interp_dim;
// NOTE(review): indices are zero-based, so row == n_sim and column == num_cols are
// already out of range, yet `>` lets both pass; such threads access memory that
// does not belong to their element.
if (row > n_sim || column > num_cols) return;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
}
d_matrix[index] = res;
}
// Compares the CPU reference `host` with the GPU result `device`, element by element,
// over `size` entries and prints a summary.
// Values are compared for exact equality. Prints "Perfect match!" when nothing
// differs; otherwise prints how many entries differ and how many of those are zero
// on the device side.
// The former assert inside the mismatch branch aborted at the first difference, which
// made the statistics below unreachable in debug builds; it has been removed so the
// summary is always produced.
void verify(fl_type *host, fl_type *device, int size) {
    int count = 0;      // entries where host and device differ
    int count_zero = 0; // differing entries whose device value is zero
    for (int i = 0; i < size; i++) {
        if (host[i] != device[i]) {
            count++;
            if (device[i] == 0.0)
                count_zero++;
        }
    }
    if (count) {
        // count > 0 implies size > 0, so the divisions below are safe.
        std::cout << "Non matching: " << count << " out of " << size << "(" << (float(count) / size * 100) << "%)" << std::endl;
        std::cout << "Zeros returned from the device: " << count_zero <<"(" << (float(count_zero) / size * 100) << "%)" << std::endl;
    }
    else
        std::cout << "Perfect match!" << std::endl;
}
// Benchmark driver: builds random inputs on the host, computes the interpolation
// with the CPU reference and with the GPU implementation, prints the timings and
// checks that both results agree.
int main() {
    int fl_size = sizeof(fl_type);
    int pos_size = sizeof(pos_type);
    int dim = 5; // range: 2-5
    int number_nodes = 5500; // range: 10.000-500.000
    int max_actions = 12; // range: 6-200
    int n_sim = 1000; // range: 1.000-10.000
    // 2^dim entries per column; an integer shift avoids converting the double
    // returned by std::pow back to int.
    int interp_dim = 1 << dim;
    int grid_values_size = n_sim*number_nodes;
    std::default_random_engine generator;
    std::normal_distribution<fl_type> normal_dist(0.0, 1);
    std::uniform_int_distribution<> uniform_dist(0, number_nodes - 1);
    // NOTE(review): despite its name this accumulates bytes, and the weights/node_map
    // terms below leave out the interp_dim factor, so the "MB allocated" line
    // under-reports. The arithmetic is left untouched so the printed output stays the same.
    double bit_allocated = 0;
    fl_type * grid_values; //flattened 2d array, containing the value of the grid (n_sims x number_nodes)
    // Byte counts are computed in size_t so larger configurations do not overflow int.
    grid_values = (fl_type *)malloc((size_t)grid_values_size * fl_size);
    bit_allocated += grid_values_size * fl_size;
    for (int i = 0; i < grid_values_size; i++)
        grid_values[i] = normal_dist(generator);
    pos_type * map_node2values_start; //vector that maps each node to the first column of the result matrix regarding that node
    pos_type * map_node2values_how_many; //vector that stores how many action we have per node
    map_node2values_start = (pos_type *)malloc((size_t)number_nodes * pos_size);
    map_node2values_how_many = (pos_type *)malloc((size_t)number_nodes * pos_size);
    bit_allocated += 2 * (number_nodes * pos_size);
    for (int i = 0; i < number_nodes; i++) {
        //each node has simply max_actions
        map_node2values_start[i] = max_actions*i;
        map_node2values_how_many[i] = max_actions;
    }
    //total number of actions, which is amount of column of the results
    int total_action_number = map_node2values_start[number_nodes - 1] + map_node2values_how_many[number_nodes - 1];
    //vectors that keep track of the columns to grab, and their weight in the interpolation
    fl_type* weights;
    pos_type * node_map;
    // weights holds fl_type values, so it is sized with fl_size (it was pos_size).
    weights = (fl_type *)malloc((size_t)total_action_number*interp_dim * fl_size);
    bit_allocated += total_action_number * fl_size;
    node_map = (pos_type *)malloc((size_t)total_action_number*interp_dim * pos_size);
    bit_allocated += total_action_number * pos_size;
    //filling with random numbers
    for (int i = 0; i < total_action_number*interp_dim; i++) {
        node_map[i] = uniform_dist(generator); // picking random column
        weights[i] = 1.0 / interp_dim; // uniform weights
    }
    std::cout << "done filling!" << std::endl;
    std::cout << bit_allocated / 8 / 1024 / 1024 << "MB allocated" << std::endl;
    int result_size = n_sim*total_action_number;
    fl_type *interp_value_cpu;
    bit_allocated += result_size* fl_size;
    interp_value_cpu = (fl_type *)malloc((size_t)result_size* fl_size);
    auto start = std::chrono::steady_clock::now();
    for (int row = 0; row < n_sim; row++) {
        for (int column = 0; column < total_action_number; column++) {
            auto zz = iterp_cpu(weights, node_map, grid_values, row, column, interp_dim, n_sim);
            interp_value_cpu[column*n_sim + row] = zz;
        }
    }
    auto elapsed_cpu = std::chrono::steady_clock::now() - start;
    std::cout << "Crunching values on the CPU (serial): " << std::chrono::duration_cast<ms>(elapsed_cpu).count() / 1000.0 << "s" << std::endl;
    int * pp = nullptr;
    cudaMalloc((void**)&pp, sizeof(int)); //initializing the device, to not affect the benchmark
    fl_type *interp_value_gpu;
    interp_value_gpu = (fl_type *)malloc((size_t)result_size* fl_size);
    start = std::chrono::steady_clock::now();
    cuda_interpolation_function(interp_value_gpu, result_size, grid_values, grid_values_size, weights, node_map, total_action_number, interp_dim, n_sim);
    auto elapsed_gpu = std::chrono::steady_clock::now() - start;
    std::cout << "Crunching values on the GPU: " << std::chrono::duration_cast<ms>(elapsed_gpu).count() / 1000.0 << "s" << std::endl;
    float ms_cpu = std::chrono::duration_cast<ms>(elapsed_cpu).count();
    float ms_gpu = std::chrono::duration_cast<ms>(elapsed_gpu).count();
    int n_proc = 4;
    // Relative difference against an idealised n_proc-way parallel CPU run; the value
    // is negative when the GPU is faster.
    std::cout << "Performance: " << (ms_gpu- ms_cpu / n_proc) / (ms_cpu / n_proc) * 100 << " % less time than parallel CPU!" << std::endl;
    verify(interp_value_cpu, interp_value_gpu, result_size);
    free(interp_value_cpu);
    free(interp_value_gpu);
    free(grid_values);
    free(node_map);
    free(weights);
    // These three were previously never released.
    free(map_node2values_start);
    free(map_node2values_how_many);
    cudaFree(pp);
    return 0;
}
// Reports a failed CUDA call by printing `what` followed by the CUDA error string.
// Returns true when `code` is cudaSuccess.
static bool cuda_ok(cudaError_t code, const char* what) {
    if (code == cudaSuccess) return true;
    std::cout << what << " (" << cudaGetErrorString(code) << ")" << std::endl;
    return false;
}

// Runs the interpolation on the GPU: copies grid_values, weights and node_map to the
// device, launches the kernel and copies result_size values back into interp_value_gpu.
// Every CUDA call is checked. After the first failure the remaining steps are skipped
// (interp_value_gpu is then left unmodified) and all device buffers are released.
void cuda_interpolation_function(fl_type* interp_value_gpu, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim) {
    // size_t keeps the byte counts below from overflowing int.
    const size_t fl_size = sizeof(fl_type);
    const size_t pos_size = sizeof(pos_type);
    auto start = std::chrono::steady_clock::now();
    //device versions of the inputs; null so the cudaFree calls at the end are valid on every path
    fl_type * grid_values_device = nullptr;
    fl_type* weights_device = nullptr;
    pos_type * node_map_device = nullptr;
    fl_type *interp_value_device = nullptr;
    int lenght_node_map = interp_dim*total_action_number;
    std::cout << "size grid_values: " << grid_values_size <<std::endl;
    std::cout << "size weights: " << lenght_node_map << std::endl;
    std::cout << "size interp_value: " << result_size << std::endl;
    //allocating and moving to the GPU the inputs; && stops at the first failure
    bool ok =
        cuda_ok(cudaMalloc((void**)&grid_values_device, grid_values_size*fl_size), "Error during cudaMalloc of the grid_values") &&
        cuda_ok(cudaMemcpy(grid_values_device, grid_values, grid_values_size*fl_size, cudaMemcpyHostToDevice), "Error during cudaMemcpy of the grid_values") &&
        cuda_ok(cudaMalloc((void**)&weights_device, lenght_node_map*fl_size), "Error during cudaMalloc of the weights") &&
        cuda_ok(cudaMemcpy(weights_device, weights, lenght_node_map*fl_size, cudaMemcpyHostToDevice), "Error during cudaMemcpy of the weights") &&
        cuda_ok(cudaMalloc((void**)&node_map_device, lenght_node_map*pos_size), "Error during cudaMalloc of node_map") &&
        cuda_ok(cudaMemcpy(node_map_device, node_map, lenght_node_map*pos_size, cudaMemcpyHostToDevice), "Error during cudaMemcpy of node_map") &&
        cuda_ok(cudaMalloc((void**)&interp_value_device, result_size*fl_size), "Error during cudaMalloc of interp_value_device ");
    auto elapsed_moving = std::chrono::steady_clock::now() - start;
    float ms_moving = std::chrono::duration_cast<ms>(elapsed_moving).count();
    float ms_moving_back = 0;
    if (ok) {
        ok = cuda_ok(cudaDeviceSynchronize(), "Error during cudaDeviceSynchronize before the kernel launch");
    }
    if (ok) {
        //1d
        int block_size = 1024;
        int num_blocks = (result_size + block_size - 1) / block_size;
        std::cout << "num_blocks:" << num_blocks << std::endl;
        interp_kernel << < num_blocks, block_size >> > (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
        //2d
        //int block_size2 = 32; //each block will have block_size2*block_size2 threads
        //dim3 num_blocks2(block_size2, block_size2);
        //int x_grid = (total_action_number + block_size2 - 1) / block_size2;
        //int y_grid = (n_sim + block_size2 - 1) / block_size2;
        //dim3 grid_size2(x_grid, y_grid);
        //std::cout <<"grid:"<< x_grid<<" x "<< y_grid<<std::endl;
        //interp_kernel2D <<< grid_size2, num_blocks2 >>> (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
        // Launch-configuration errors are reported by cudaGetLastError(); faults raised
        // while the kernel runs are returned by the synchronize.
        ok = cuda_ok(cudaGetLastError(), "Cuda kernel failed!") &&
             cuda_ok(cudaDeviceSynchronize(), "Cuda kernel failed!");
    }
    if (ok) {
        start = std::chrono::steady_clock::now();
        ok = cuda_ok(cudaMemcpy(interp_value_gpu, interp_value_device, result_size*fl_size, cudaMemcpyDeviceToHost), "Error during cudaMemcpy of interp_value_gpu");
        auto elapsed_moving_back = std::chrono::steady_clock::now() - start;
        ms_moving_back = std::chrono::duration_cast<ms>(elapsed_moving_back).count();
    }
    std::cout << "Time spent moving the data to the GPU:" << ms_moving << " ms"<<std::endl;
    std::cout << "Time spent moving the results back to the host: " << ms_moving_back << " ms" << std::endl;
    cudaFree(interp_value_device);
    cudaFree(weights_device);
    cudaFree(node_map_device);
    cudaFree(grid_values_device);
}
此外,我将非常感谢任何关于如何提高代码性能的指导。
最佳答案
每当您在使用 CUDA 代码时遇到问题,我都建议您进行适当的 CUDA 错误检查(您似乎基本上已经这样做了),并使用 cuda-memcheck 运行您的代码。后一个工具类似于 Nsight VSE 中的“启用内存检查器”选项,但并不完全相同。不过,Nsight VSE 的内存检查器可能也会给出同样的提示。
在 C(或 C++)中,数组的索引通常从 0 开始。因此,要检测越界索引,必须检查生成的索引是否等于或大于数组的大小。但在您的代码中,您只检查了是否大于:
if (row > n_sim || column > num_cols) return;
您在 1D 内核和 2D 内核中都犯了同样的错误。尽管您认为 1D 内核工作正常,但它实际上也在进行越界访问。如果您使用上面提到的 cuda-memcheck 工具(或者使用可以在 Nsight VSE 中启用的内存检查器)运行程序,就可以验证这一点。
当我修改您 pastebin 链接中的代码、使用正确的范围/边界检查之后,cuda-memcheck 不再报告任何错误,并且您的程序给出了正确的结果。两种情况我都测试过;下面的代码是在您 pastebin 链接代码的基础上修改的,取消了 2D 情况的注释,并用它代替了 1D 情况:
$ cat t375.cu | more
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <assert.h>
#include <iostream>
#include <random>
#include <chrono>
// Floating-point type of the grid values, weights and results.
typedef float fl_type;
// Integer type of the node_map entries.
typedef int pos_type;
// Duration unit used when printing timings.
typedef std::chrono::milliseconds ms;
// Forward declaration of the GPU entry point, which is defined after main().
// (Re-joined: the pager had split this prototype in the middle of identifiers.)
void cuda_interpolation_function(fl_type* interp_value_back, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim);
// CPU reference for a single element (row, column) of the result matrix.
// Returns the sum, over the interp_dim entries k that start at column*interp_dim,
// of weights[k] * grid_values[row + node_map[k]*n_sim].
// The first entry is read unconditionally, so interp_dim is expected to be >= 1.
// (Re-joined: the pager had split the signature and one expression mid-token.)
fl_type iterp_cpu(fl_type* weights, pos_type* node_map, fl_type* grid_values, int& row, int& column, int& interp_dim, int& n_sim) {
    int w_p = column*interp_dim; // offset of this column's first entry
    fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
    for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
        res += weights[w_p + inter_point] * grid_values[node_map[w_p + inter_point] * n_sim + row];
    }
    return res;
}
// 1D kernel: each thread computes one element of the column-major result d_matrix.
// Launch layout: 1D grid of 1D blocks with at least n_sim*num_cols threads in total.
// The flat thread index is used directly as the element's position in d_matrix and
// is split into column = index / n_sim and row = index % n_sim.
// The value is the sum, over the interp_dim entries k starting at column*interp_dim,
// of weights[k] * grid_values[row + node_map[k]*n_sim]; interp_dim must be >= 1.
// (Re-joined: the pager had split the signature and one expression mid-token.)
__global__ void interp_kernel(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int column = index / n_sim;
    int row = index % n_sim;
    // Indices are zero-based, so column == num_cols is already past the end: the
    // guard has to use >= and has to run before any memory is touched.
    if (row >= n_sim || column >= num_cols) return; // modified
    int w_p = column*interp_dim; // offset of this column's first entry
    fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
    for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
        res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
    }
    d_matrix[index] = res;
}
// 2D kernel: the thread's x coordinate selects the column and its y coordinate the
// row; the element is stored at column*n_sim + row in the column-major d_matrix.
// Launch layout: 2D grid of 2D blocks with at least num_cols threads along x and at
// least n_sim threads along y.
// The value is the sum, over the interp_dim entries k starting at column*interp_dim,
// of weights[k] * grid_values[row + node_map[k]*n_sim]; interp_dim must be >= 1.
// NOTE(review): threads that are adjacent in x write to addresses n_sim elements
// apart. Mapping x to the row instead would make those writes contiguous, but the
// launch configuration would have to be changed together with the kernel.
// (Re-joined: the pager had split the signature and one expression mid-token.)
__global__ void interp_kernel2D(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
    int column = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    // Indices are zero-based, so row == n_sim and column == num_cols are already past
    // the end: the guard has to use >= and has to run before any memory is touched.
    if (row >= n_sim || column >= num_cols) return; // modified
    int index = column*n_sim + row;
    int w_p = column*interp_dim; // offset of this column's first entry
    fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
    for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
        res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
    }
    d_matrix[index] = res;
}
// Compares the CPU reference `host` with the GPU result `device`, element by element,
// over `size` entries and prints a summary.
// Values are compared for exact equality. Prints "Perfect match!" when nothing
// differs; otherwise prints how many entries differ and how many of those are zero
// on the device side.
// The former assert inside the mismatch branch aborted at the first difference, which
// made the statistics below unreachable in debug builds; it has been removed so the
// summary is always produced.
// (Re-joined: the pager had split the output statements mid-token.)
void verify(fl_type *host, fl_type *device, int size) {
    int count = 0;      // entries where host and device differ
    int count_zero = 0; // differing entries whose device value is zero
    for (int i = 0; i < size; i++) {
        if (host[i] != device[i]) {
            count++;
            if (device[i] == 0.0)
                count_zero++;
        }
    }
    if (count) {
        // count > 0 implies size > 0, so the divisions below are safe.
        std::cout << "Non matching: " << count << " out of " << size << "(" << (float(count) / size * 100) << "%)" << std::endl;
        std::cout << "Zeros returned from the device: " << count_zero <<"(" << (float(count_zero) / size * 100) << "%)" << std::endl;
    }
    else
        std::cout << "Perfect match!" << std::endl;
}
// Benchmark driver: builds random inputs on the host, computes the interpolation
// with the CPU reference and with the GPU implementation, prints the timings and
// checks that both results agree.
int main() {
    int fl_size = sizeof(fl_type);
    int pos_size = sizeof(pos_type);
    int dim = 5; // range: 2-5
    int number_nodes = 5500; // range: 10.000-500.000
    int max_actions = 12; // range: 6-200
    int n_sim = 1000; // range: 1.000-10.000
    // 2^dim entries per column; an integer shift avoids converting the double
    // returned by std::pow back to int.
    int interp_dim = 1 << dim;
    int grid_values_size = n_sim*number_nodes;
    std::default_random_engine generator;
    std::normal_distribution<fl_type> normal_dist(0.0, 1);
    std::uniform_int_distribution<> uniform_dist(0, number_nodes - 1);
    // NOTE(review): despite its name this accumulates bytes, and the weights/node_map
    // terms below leave out the interp_dim factor, so the "MB allocated" line
    // under-reports. The arithmetic is left untouched so the printed output stays the same.
    double bit_allocated = 0;
    fl_type * grid_values; //flattened 2d array, containing the value of the grid (n_sims x number_nodes)
    // Byte counts are computed in size_t so larger configurations do not overflow int.
    grid_values = (fl_type *)malloc((size_t)grid_values_size * fl_size);
    bit_allocated += grid_values_size * fl_size;
    for (int i = 0; i < grid_values_size; i++)
        grid_values[i] = normal_dist(generator);
    pos_type * map_node2values_start; //vector that maps each node to the first column of the result matrix regarding that node
    pos_type * map_node2values_how_many; //vector that stores how many action we have per node
    map_node2values_start = (pos_type *)malloc((size_t)number_nodes * pos_size);
    map_node2values_how_many = (pos_type *)malloc((size_t)number_nodes * pos_size);
    bit_allocated += 2 * (number_nodes * pos_size);
    for (int i = 0; i < number_nodes; i++) {
        //each node has simply max_actions
        map_node2values_start[i] = max_actions*i;
        map_node2values_how_many[i] = max_actions;
    }
    //total number of actions, which is amount of column of the results
    int total_action_number = map_node2values_start[number_nodes - 1] + map_node2values_how_many[number_nodes - 1];
    //vectors that keep track of the columns to grab, and their weight in the interpolation
    fl_type* weights;
    pos_type * node_map;
    // weights holds fl_type values, so it is sized with fl_size (it was pos_size).
    weights = (fl_type *)malloc((size_t)total_action_number*interp_dim * fl_size);
    bit_allocated += total_action_number * fl_size;
    node_map = (pos_type *)malloc((size_t)total_action_number*interp_dim * pos_size);
    bit_allocated += total_action_number * pos_size;
    //filling with random numbers
    for (int i = 0; i < total_action_number*interp_dim; i++) {
        node_map[i] = uniform_dist(generator); // picking random column
        weights[i] = 1.0 / interp_dim; // uniform weights
    }
    std::cout << "done filling!" << std::endl;
    std::cout << bit_allocated / 8 / 1024 / 1024 << "MB allocated" << std::endl;
    int result_size = n_sim*total_action_number;
    fl_type *interp_value_cpu;
    bit_allocated += result_size* fl_size;
    interp_value_cpu = (fl_type *)malloc((size_t)result_size* fl_size);
    auto start = std::chrono::steady_clock::now();
    for (int row = 0; row < n_sim; row++) {
        for (int column = 0; column < total_action_number; column++) {
            auto zz = iterp_cpu(weights, node_map, grid_values, row, column, interp_dim, n_sim);
            interp_value_cpu[column*n_sim + row] = zz;
        }
    }
    auto elapsed_cpu = std::chrono::steady_clock::now() - start;
    std::cout << "Crunching values on the CPU (serial): " << std::chrono::duration_cast<ms>(elapsed_cpu).count() / 1000.0 << "s" << std::endl;
    int * pp = nullptr;
    cudaMalloc((void**)&pp, sizeof(int)); //initializing the device, to not affect the benchmark
    fl_type *interp_value_gpu;
    interp_value_gpu = (fl_type *)malloc((size_t)result_size* fl_size);
    start = std::chrono::steady_clock::now();
    cuda_interpolation_function(interp_value_gpu, result_size, grid_values, grid_values_size, weights, node_map, total_action_number, interp_dim, n_sim);
    auto elapsed_gpu = std::chrono::steady_clock::now() - start;
    std::cout << "Crunching values on the GPU: " << std::chrono::duration_cast<ms>(elapsed_gpu).count() / 1000.0 << "s" << std::endl;
    float ms_cpu = std::chrono::duration_cast<ms>(elapsed_cpu).count();
    float ms_gpu = std::chrono::duration_cast<ms>(elapsed_gpu).count();
    int n_proc = 4;
    // Relative difference against an idealised n_proc-way parallel CPU run; the value
    // is negative when the GPU is faster.
    std::cout << "Performance: " << (ms_gpu- ms_cpu / n_proc) / (ms_cpu / n_proc) * 100 << " % less time than parallel CPU!" << std::endl;
    verify(interp_value_cpu, interp_value_gpu, result_size);
    free(interp_value_cpu);
    free(interp_value_gpu);
    free(grid_values);
    free(node_map);
    free(weights);
    // These three were previously never released.
    free(map_node2values_start);
    free(map_node2values_how_many);
    cudaFree(pp);
    return 0;
}
// Reports a failed CUDA call by printing `what` followed by the CUDA error string.
// Returns true when `code` is cudaSuccess.
static bool cuda_ok(cudaError_t code, const char* what) {
    if (code == cudaSuccess) return true;
    std::cout << what << " (" << cudaGetErrorString(code) << ")" << std::endl;
    return false;
}

// Runs the interpolation on the GPU: copies grid_values, weights and node_map to the
// device, launches the 2D kernel and copies result_size values back into
// interp_value_gpu.
// Every CUDA call is checked. After the first failure the remaining steps are skipped
// (interp_value_gpu is then left unmodified) and all device buffers are released.
void cuda_interpolation_function(fl_type* interp_value_gpu, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim) {
    // size_t keeps the byte counts below from overflowing int.
    const size_t fl_size = sizeof(fl_type);
    const size_t pos_size = sizeof(pos_type);
    auto start = std::chrono::steady_clock::now();
    //device versions of the inputs; null so the cudaFree calls at the end are valid on every path
    fl_type * grid_values_device = nullptr;
    fl_type* weights_device = nullptr;
    pos_type * node_map_device = nullptr;
    fl_type *interp_value_device = nullptr;
    int lenght_node_map = interp_dim*total_action_number;
    std::cout << "size grid_values: " << grid_values_size <<std::endl;
    std::cout << "size weights: " << lenght_node_map << std::endl;
    std::cout << "size interp_value: " << result_size << std::endl;
    //allocating and moving to the GPU the inputs; && stops at the first failure
    bool ok =
        cuda_ok(cudaMalloc((void**)&grid_values_device, grid_values_size*fl_size), "Error during cudaMalloc of the grid_values") &&
        cuda_ok(cudaMemcpy(grid_values_device, grid_values, grid_values_size*fl_size, cudaMemcpyHostToDevice), "Error during cudaMemcpy of the grid_values") &&
        cuda_ok(cudaMalloc((void**)&weights_device, lenght_node_map*fl_size), "Error during cudaMalloc of the weights") &&
        cuda_ok(cudaMemcpy(weights_device, weights, lenght_node_map*fl_size, cudaMemcpyHostToDevice), "Error during cudaMemcpy of the weights") &&
        cuda_ok(cudaMalloc((void**)&node_map_device, lenght_node_map*pos_size), "Error during cudaMalloc of node_map") &&
        cuda_ok(cudaMemcpy(node_map_device, node_map, lenght_node_map*pos_size, cudaMemcpyHostToDevice), "Error during cudaMemcpy of node_map") &&
        cuda_ok(cudaMalloc((void**)&interp_value_device, result_size*fl_size), "Error during cudaMalloc of interp_value_device ");
    auto elapsed_moving = std::chrono::steady_clock::now() - start;
    float ms_moving = std::chrono::duration_cast<ms>(elapsed_moving).count();
    float ms_moving_back = 0;
    if (ok) {
        ok = cuda_ok(cudaDeviceSynchronize(), "Error during cudaDeviceSynchronize before the kernel launch");
    }
    if (ok) {
        //1d
#if 0
        int block_size = 1024;
        int num_blocks = (result_size + block_size - 1) / block_size;
        std::cout << "num_blocks:" << num_blocks << std::endl;
        interp_kernel << < num_blocks, block_size >> > (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
#endif
        //2d
        int block_size2 = 32; //each block will have block_size2*block_size2 threads
        // Despite its name, num_blocks2 is the block shape (threads per block).
        dim3 num_blocks2(block_size2, block_size2);
        int x_grid = (total_action_number + block_size2 - 1) / block_size2; // blocks covering the columns
        int y_grid = (n_sim + block_size2 - 1) / block_size2;               // blocks covering the rows
        dim3 grid_size2(x_grid, y_grid);
        std::cout <<"grid:"<< x_grid<<" x "<< y_grid<<std::endl;
        interp_kernel2D <<< grid_size2, num_blocks2 >>> (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
        // Launch-configuration errors are reported by cudaGetLastError(); faults raised
        // while the kernel runs are returned by the synchronize.
        ok = cuda_ok(cudaGetLastError(), "Cuda kernel failed!") &&
             cuda_ok(cudaDeviceSynchronize(), "Cuda kernel failed!");
    }
    if (ok) {
        start = std::chrono::steady_clock::now();
        ok = cuda_ok(cudaMemcpy(interp_value_gpu, interp_value_device, result_size*fl_size, cudaMemcpyDeviceToHost), "Error during cudaMemcpy of interp_value_gpu");
        auto elapsed_moving_back = std::chrono::steady_clock::now() - start;
        ms_moving_back = std::chrono::duration_cast<ms>(elapsed_moving_back).count();
    }
    std::cout << "Time spent moving the data to the GPU:" << ms_moving << " ms"<<std::endl;
    std::cout << "Time spent moving the results back to the host: " << ms_moving_back << " ms" << std::endl;
    cudaFree(interp_value_device);
    cudaFree(weights_device);
    cudaFree(node_map_device);
    cudaFree(grid_values_device);
}
$ nvcc -arch=sm_52 -o t375 t375.cu -std=c++11
$ cuda-memcheck ./t375
========= CUDA-MEMCHECK
done filling!
2.69079MB allocated
Crunching values on the CPU (serial): 30.081s
size grid_values: 5500000
size weights: 2112000
size interp_value: 66000000
grid:2063 x 32
Time spent moving the data to the GPU:31 ms
Time spent moving the results back to the host: 335 ms
Crunching values on the GPU: 7.089s
Performance: -5.73452 % less time than parallel CPU!
Perfect match!
========= ERROR SUMMARY: 0 errors
$
请注意,cuda-memcheck 为了进行严格的内存边界检查,会减慢程序在 GPU 上的执行速度,因此上面的性能数据可能与正常运行时不符。下面是“正常”运行的结果:
$ ./t375
done filling!
2.69079MB allocated
Crunching values on the CPU (serial): 30.273s
size grid_values: 5500000
size weights: 2112000
size interp_value: 66000000
grid:2063 x 32
Time spent moving the data to the GPU:32 ms
Time spent moving the results back to the host: 332 ms
Crunching values on the GPU: 1.161s
Performance: -84.6596 % less time than parallel CPU!
Perfect match!
$
关于c++ - CUDA:填充列主矩阵,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/45398168/
我正在 csv 上使用 hadoop 来分析一些数据。我使用sql/mysql(不确定)来分析数据,现在陷入了僵局。 我花了好几个小时在谷歌上搜索,却没有找到任何相关的东西。我需要一个查询,在该查询中
我正在为 Bootstrap 网格布局的“简单”任务而苦苦挣扎。我希望在大视口(viewport)上有 4 列,然后在中型设备上有 2 列,最后在较小的设备上只有 1 列。 当我测试我的代码片段时,似
对于这个令人困惑的标题,我深表歉意,我想不出这个问题的正确措辞。相反,我只会给你背景信息和目标: 这是在一个表中,一个人可能有也可能没有多行数据,这些行可能包含相同的 activity_id 值,也可
具有 3 列的数据库表 - A int , B int , C int 我的问题是: 如何使用 Sequelize 结果找到 A > B + C const countTasks = await Ta
我在通过以下功能编写此查询时遇到问题: 首先按第 2 列 DESC 排序,然后从“不同的第 1 列”中选择 只有 Column1 是 DISTINCT 此查询没有帮助,因为它首先从第 1 列中进行选择
使用 Bootstrap 非常有趣和有帮助,目前我在创建以下需求时遇到问题。 “使用 bootstrap 在桌面上有 4 列,在平板电脑上有 2 列,在移动设备上有 1 列”谁能告诉我正确的结构 最佳
我是 R 新手,正在问一个非常基本的问题。当然,我在尝试从所提供的示例中获取指导的同时做了功课here和 here ,但无法在我的案例中实现这个想法,即可能是由于我的问题中的比较维度更大。 我的实
通常我会使用 R 并执行 merge.by,但这个文件似乎太大了,部门中的任何一台计算机都无法处理它! (任何从事遗传学工作的人的附加信息)本质上,插补似乎删除了 snp ID 的 rs 数字,我只剩
我有一个 df , delta1 delta2 0 -1 2 0 -1 0 0 0 我想知道如何分配 delt
您好,我想知道是否可以执行以下操作。显然,我已经尝试在 phpMyAdmin 中运行它,但出现错误。也许还有另一种方式来编写此查询。 SELECT * FROM eat_eat_restaurants
我有 2 个列表(标题和数据值)。我想要将数据值列 1 匹配并替换为头文件列 1,以获得与 dataValue 列 1 和标题值列 2 匹配的值 头文件 TotalLoad,M0001001 Hois
我有两个不同长度的文件,file2 是一个很大的引用文件,我从中提取文件 1 的数据。 我有一行 awk,我通常会对其进行调整以在我的文件中进行查找和替换,但它总是在同一列中进行查找和替换。 所以对于
假设我有两个表,如下所示。 create table contract( c_ID number(1) primary key, c_name varchar2(50) not
我有一个带有 varchar 列的 H2 表,其检查约束定义如下: CONSTRAINT my_constraint CHECK (varchar_field <> '') 以下插入语句失败,但当我删
这是最少量的代码,可以清楚地说明我的问题: One Two Three 前 2 个 div 应该是 2 个左列。第三个应该占据页面的其余部分。最后,我将添加选项来隐藏和
在 Azure 中的 Log Analytics 中,我为 VM Heartbeat 选择一个预定义查询,我在编辑器中运行查询正常,但当我去创建警报时,我不断收到警报“查询未返回 TimeGenera
在 Azure 中的 Log Analytics 中,我为 VM Heartbeat 选择一个预定义查询,我在编辑器中运行查询正常,但当我去创建警报时,我不断收到警报“查询未返回 TimeGenera
今天我开始使用 JexcelApi 并遇到了这个:当您尝试从特定位置获取元素时,不是像您通常期望的那样使用sheet.getCell(row,col),而是使用sheet.getCell(col,ro
我有一个包含 28 列的数据库。第一列是代码,第二列是名称,其余是值。 public void displayData() { con.Open(); MySqlDataAdapter
我很沮丧:每当我缩小这个网页时,一切都变得一团糟。我如何将网页居中,以便我可以缩小并且元素不会被错误定位。 (它应该是 2 列,但所有内容都合并为 1)我试过 但由于某种原因,这不起作用。 www.o
我是一名优秀的程序员,十分优秀!