gpt4 book ai didi

c++ - 连续 block 从初始 block 读取内存

转载 作者:行者123 更新时间:2023-11-28 00:23:00 25 4
gpt4 key购买 nike

所以这是我程序的一部分,我对两个类(class)进行了减和。我通过共享数组 __shared__ int nrules[max_threads * MAX_CLASSES]; 的一半索引类,所以第一类从 nrules[0] 开始,第二类从 开始nrules[blockDim.x 或 max_threads]。减少了两半。总和保存在作为参数传递的全局数组中,该数组将保留每个 block 的总和,因此由 blockIdx.x 索引。

我有一个测试用例的大小,用MAX_SIZE表示,所有测试先从1处理到MAX_SIZE,求和在全局累加每个 block 的数组。

我想调用一个 block 数等于我的测试数 (10000) 的内核,但是总和有一些问题,所以我改为按步骤进行。

我找不到解决这个问题的方法,但是每当我调用一个 block 数超过 max_threads 的内核时,它就会开始从初始 block 中求和。如果执行代码,您会看到它将打印每个 block 的值,在本例中为 64,每个 block 有 64 个线程。如果我再执行至少一个 block ,它的总和将改为 128。这是第一类总和。就好像偏移变量什么都不做,写入又发生在第一个 block 上。当 MAX_SIZE = 3 时,第一个 block 的第二类总和更改为 192。 这里的 Cuda 功能是 2.0,一张 GT 520 卡。使用 CUDA 6.5 编译。

#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

}
}

#define MAX_CLASSES 2
#define max_threads 64
//#define MAX_FEATURES 65

__device__ __constant__ int d_MAX_SIZE;
__device__ __constant__ int offset;

__device__ void rules_points_reduction(float points[max_threads * MAX_CLASSES], int nrules[max_threads * MAX_CLASSES]){

float psum[MAX_CLASSES];
int nsum[MAX_CLASSES];

for (int i = 0; i < MAX_CLASSES; i++){
psum[i] = points[threadIdx.x + i * blockDim.x];
nsum[i] = nrules[threadIdx.x + i * blockDim.x];
}

__syncthreads();

if (blockDim.x >= 1024) {
if (threadIdx.x < 512) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 512 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 512 + i * blockDim.x];
}

} __syncthreads();
}
if (blockDim.x >= 512) {
if (threadIdx.x < 256) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 256 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 256 + i * blockDim.x];
}
} __syncthreads();
}
if (blockDim.x >= 256) {
if (threadIdx.x < 128) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 128 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 128 + i * blockDim.x];
}
} __syncthreads();
}
if (blockDim.x >= 128) {
if (threadIdx.x < 64) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 64 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 64 + i * blockDim.x];
}
} __syncthreads();
}

if (threadIdx.x < 32)
{
// now that we are using warp-synchronous programming (below)
// we need to declare our shared memory volatile so that the compiler
// doesn't reorder stores to it and induce incorrect behavior.
//volatile int* smem = nrules;
//volatile float* smemf = points;
if (blockDim.x >= 64) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 32 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 32 + i * blockDim.x];
}
}
if (blockDim.x >= 32) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 16 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 16 + i * blockDim.x];
}
}
if (blockDim.x >= 16) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 8 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 8 + i * blockDim.x];
}
}
if (blockDim.x >= 8) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 4 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 4 + i * blockDim.x];
}
}
if (blockDim.x >= 4) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 2 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 2 + i * blockDim.x];
}
}
if (blockDim.x >= 2) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 1 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 1 + i * blockDim.x];
}
}
}

}

__device__ void d_get_THE_prediction(short k, float* finalpoints, int* gn_rules)
{
int max;
short true_label, n_items;

__shared__ float points[max_threads * MAX_CLASSES];
__shared__ int nrules[max_threads * MAX_CLASSES];
//__shared__ short items[MAX_FEATURES], ele[MAX_FEATURES];
__shared__ int max2;

for (int i = 0; i < MAX_CLASSES; i++)
{
points[threadIdx.x + i * blockDim.x] = 1;
nrules[threadIdx.x + i * blockDim.x] = 1;
}

if (threadIdx.x == 0) {
if (k == 1){
nrules[0] = 1;
nrules[blockDim.x] = 1;
}
//max2 = GetBinCoeff_l_d(n_items, k);
}
__syncthreads();

//max = max2;

//d_induce_rules(items, ele, n_items, k, max, nrules, points);

__syncthreads();

rules_points_reduction(points, nrules);

if (threadIdx.x == 0){

for (int i = 0; i < MAX_CLASSES; i++){
gn_rules[(blockIdx.x + offset) + i * blockDim.x] += nrules[i * blockDim.x];
finalpoints[(blockIdx.x + offset) + i * blockDim.x] += points[i * blockDim.x];

}
printf("block %d k%d %f %f %d %d\n", (blockIdx.x + offset), k, finalpoints[(blockIdx.x + offset)],
finalpoints[(blockIdx.x + offset) + blockDim.x], gn_rules[(blockIdx.x + offset)], gn_rules[(blockIdx.x + offset) + blockDim.x]);

}
}

__global__ void lazy_supervised_classification_kernel(int k, float* finalpoints, int* n_rules){

d_get_THE_prediction( k, finalpoints, n_rules);

}


int main() {
//freopen("output.txt", "w", stdout);

int N_TESTS = 10000;
int MAX_SIZE = 3;

float *finalpoints = (float*)calloc(MAX_CLASSES * N_TESTS, sizeof(float));
float *d_finalpoints = 0;

int *d_nruls = 0;
int *nruls = (int*)calloc(MAX_CLASSES * N_TESTS, sizeof(int));

gpuErrchk(cudaMalloc(&d_finalpoints, MAX_CLASSES * N_TESTS * sizeof(float)));
gpuErrchk(cudaMemset(d_finalpoints, 0, MAX_CLASSES * N_TESTS * sizeof(float)));

gpuErrchk(cudaMalloc(&d_nruls, MAX_CLASSES * N_TESTS * sizeof(int)));
gpuErrchk(cudaMemset(d_nruls, 0, MAX_CLASSES * N_TESTS * sizeof(int)));

gpuErrchk(cudaMemcpyToSymbol(d_MAX_SIZE, &MAX_SIZE, sizeof(int), 0, cudaMemcpyHostToDevice));

int step = max_threads, ofset = 0;

for (int k = 1; k < MAX_SIZE; k++){

//N_TESTS-step
for (ofset = 0; ofset < max_threads; ofset += step){

gpuErrchk(cudaMemcpyToSymbol(offset, &ofset, sizeof(int), 0, cudaMemcpyHostToDevice));
lazy_supervised_classification_kernel <<<step, max_threads >>>(k, d_finalpoints, d_nruls);
gpuErrchk(cudaDeviceSynchronize());
}

gpuErrchk(cudaMemcpyToSymbol(offset, &ofset, sizeof(int), 0, cudaMemcpyHostToDevice));//comment these lines
//N_TESTS - step
lazy_supervised_classification_kernel <<<3, max_threads >> >(k, d_finalpoints, d_nruls);//
gpuErrchk(cudaDeviceSynchronize());//

}
gpuErrchk(cudaFree(d_finalpoints));
gpuErrchk(cudaFree(d_nruls));
free(finalpoints);
free(nruls);

gpuErrchk(cudaDeviceReset());
return(0);
}

最佳答案

我不相信这个索引是你想要的:

 gn_rules[(blockIdx.x + offset) + i * blockDim.x] += ...;
finalpoints[(blockIdx.x + offset) + i * blockDim.x] += ...;

对于 MAX_CLASSES = 2,每个 block 需要存储 2 个 finalpoints 值和 2 个 gn_rules 值。因此,当 offset 不为零时,它需要按 MAX_CLASSES 值缩放,以便索引到该 block 正确存储的开始。

因此,如果您将上面的代码行更改为:

 gn_rules[(blockIdx.x + (offset*MAX_CLASSES)) + i * blockDim.x] += nrules[i * blockDim.x];
finalpoints[(blockIdx.x + (offset*MAX_CLASSES)) + i * blockDim.x] += points[i * blockDim.x];

我相信你会得到你期望的输出。

关于c++ - 连续 block 从初始 block 读取内存,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/26539459/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com