gpt4 book ai didi

cuda - N-Body CUDA 优化

转载 作者:行者123 更新时间:2023-12-04 11:10:14 25 4
gpt4 key购买 nike

我正在 CUDA 中开发 N 体算法,我想学习一些优化技巧。

我设法得到 16384机构运行在 20Flops在具有 27 的 NVIDIA Geforce GTX 260 上流式多处理器。
KernelcomputeForces功能是慢戳取约95%当时,我想知道是否还有其他方法可以优化我的代码。

据我所知,我已经针对内存空间局部性和内存写入进行了优化。在 CUDA 文档的某个地方,它说共享内存更快,但我不知道如何利用它。我已经在 16 中分配了工作块 512每个线程上都有,但这对我来说有点模糊。

请帮助并感谢您阅读本文。

n   is number of bodies

gm is the gpu mass pointer

gpx is the gpu position x pointer

gpy is the gpu position y pointer

gpz is the gpu position z pointer

gfx is the gpu force x pointer

gfy is the gpu force y pointer

gfz is the gpu force z pointer

相关核函数
__global__ void KernelcomputeForces( unsigned int n, float* gm, float* gpx, float* gpy, float* gpz, float* gfx, float* gfy, float* gfz ){
int tid = blockDim.x * blockIdx.x + threadIdx.x;
int numThreads = blockDim.x * gridDim.x;

float GRAVITY = 0.00001f;

//compare all with all
for( unsigned int ia=tid; ia<n; ia+=numThreads ){
float lfx = 0.0f;
float lfy = 0.0f;
float lfz = 0.0f;

for( unsigned int ib=0; ib<n; ib++ ){
//compute distance
float dx = ( gpx[ib] - gpx[ia]);
float dy = ( gpy[ib] - gpy[ia] );
float dz = ( gpz[ib] - gpz[ia] );
//float distance = sqrt( dx*dx + dy*dy + dz*dz );
float distanceSquared = dx*dx + dy*dy + dz*dz;

//prevent slingshots and division by zero
//distance += 0.1f;
distanceSquared += 0.01f;

//calculate gravitational magnitude between the bodies
//float magnitude = GRAVITY * ( gm[ia] * gm[ib] ) / ( distance * distance * distance * distance );
float magnitude = GRAVITY * ( gm[ia] * gm[ib] ) / ( distanceSquared );

//calculate forces for the bodies
//magnitude times direction
lfx += magnitude * ( dx );
lfy += magnitude * ( dy );
lfz += magnitude * ( dz );
}

//stores local memory to global memory
gfx[ia] = lfx;
gfy[ia] = lfy;
gfz[ia] = lfz;
}
}

extern void GPUcomputeForces( unsigned int n, float* gm, float* gpx, float* gpy, float* gpz, float* gfx, float* gfy, float* gfz ){
dim3 gridDim( 16, 1, 1 ); //specifys how many blocks in three possible dimensions
dim3 blockDim( 512, 1, 1 ); //threads per block
KernelcomputeForces<<<gridDim, blockDim>>>( n, gm, gpx, gpy, gpz, gfx, gfy, gfz );
}

最佳答案

@talonmies 已经回答了这个问题,展示了共享内存如何有助于提高性能。所以,没有更多要添加的了。我只想提供一个完整的代码,有不同的优化步骤,包括瓷砖 , 共享内存 , 和 洗牌操作 ,并显示在 Kepler K20c 上执行的一些测试结果。

下面的代码围绕@talonmies 提供的内容进行模制,具有 5核函数,即
KernelcomputeForces : 没有优化
KernelcomputeForces_Shared :利用源质量和共享内存中的平铺
KernelcomputeForces_Tiling :在目的地群众中利用平铺
KernelcomputeForces_Tiling_Shared :利用源和目标块中的平铺并利用共享内存
KernelcomputeForces_Tiling_Shuffle : 同上,但使用 shuffle 操作而不是共享内存。

也许,与文献中已经发表的内容( Fast N-Body Simulation with CUDA )和已经可用的代码(参见上述答案和 Mark Harris' GitHub N-body page )相比,最后一个内核是唯一的新事物。但我已经玩过 N- body,并发现发布此答案很有用,可能对下一个用户有用。

这是代码

#include <stdio.h>

#define GRAVITATIONAL_CONST 6.67*1e−11
#define SOFTENING 1e-9f

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}

/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/*******************/
/* KERNEL FUNCTION */
/*******************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;

if (tid < N) {

float invDist, invDist3;

// --- Initialize register accumulators for forces
float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

// --- Move target particle data to registers
float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

// --- Interact all with all
for(unsigned int ib=0; ib<N; ib++) {

// --- Compute relative distances
float dx = (x_d[ib] - x_reg);
float dy = (y_d[ib] - y_reg);
float dz = (z_d[ib] - z_reg);
float distanceSquared = dx*dx + dy*dy + dz*dz;

// --- Prevent slingshots and division by zero
distanceSquared += SOFTENING;

float invDist = rsqrtf(distanceSquared);
float invDist3 = invDist * invDist * invDist;

// --- Calculate gravitational magnitude between the bodies
float magnitude = GRAVITATIONAL_CONST * (m_reg * m_d[ib]) * invDist3;

// --- Calculate forces for the bodies: magnitude times direction
fx_temp += magnitude*dx;
fy_temp += magnitude*dy;
fz_temp += magnitude*dz;
}

// --- Stores local memory to global memory
fx_d[tid] = fx_temp;
fy_d[tid] = fy_temp;
fz_d[tid] = fz_temp;
}
}

/**********************************/
/* KERNEL FUNCTION: SHARED MEMORY */
/**********************************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces_Shared(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;

if (tid < N) {

float invDist, invDist3;

__shared__ float x_sh[BLOCKSIZE], y_sh[BLOCKSIZE], z_sh[BLOCKSIZE], m_sh[BLOCKSIZE];

// --- Initialize register accumulators for forces
float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

// --- Move target particle data to registers
float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

// --- Interact all with all
for(unsigned int ib=0; ib<N; ib+=BLOCKSIZE) {

// --- Loading data to shared memory
x_sh[threadIdx.x] = x_d[ib + threadIdx.x];
y_sh[threadIdx.x] = y_d[ib + threadIdx.x];
z_sh[threadIdx.x] = z_d[ib + threadIdx.x];
m_sh[threadIdx.x] = m_d[ib + threadIdx.x];

__syncthreads();

#pragma unroll
for(unsigned int ic=0; ic<BLOCKSIZE; ic++) {

// --- Compute relative distances
float dx = (x_sh[ic] - x_reg);
float dy = (y_sh[ic] - y_reg);
float dz = (z_sh[ic] - z_reg);
float distanceSquared = dx*dx + dy*dy + dz*dz;

// --- Prevent slingshots and division by zero
distanceSquared += SOFTENING;

float invDist = rsqrtf(distanceSquared);
float invDist3 = invDist * invDist * invDist;

// --- Calculate gravitational magnitude between the bodies
float magnitude = GRAVITATIONAL_CONST * (m_reg * m_sh[ic]) * invDist3;

// --- Calculate forces for the bodies: magnitude times direction
fx_temp += magnitude*dx;
fy_temp += magnitude*dy;
fz_temp += magnitude*dz;
}
__syncthreads();
}

// --- Stores local memory to global memory
fx_d[tid] = fx_temp;
fy_d[tid] = fy_temp;
fz_d[tid] = fz_temp;
}
}

/***************************/
/* KERNEL FUNCTION: TILING */
/***************************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces_Tiling(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
float invDist, invDist3;

for (unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
tid < N;
tid += blockDim.x*gridDim.x) {

// --- Initialize register accumulators for forces
float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

// --- Move target particle data to registers
float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

// --- Interact all with all
for(unsigned int ib=0; ib<N; ib++) {

// --- Compute relative distances
float dx = (x_d[ib] - x_reg);
float dy = (y_d[ib] - y_reg);
float dz = (z_d[ib] - z_reg);
float distanceSquared = dx*dx + dy*dy + dz*dz;

// --- Prevent slingshots and division by zero
distanceSquared += SOFTENING;

float invDist = rsqrtf(distanceSquared);
float invDist3 = invDist * invDist * invDist;

// --- Calculate gravitational magnitude between the bodies
float magnitude = GRAVITATIONAL_CONST * (m_reg * m_d[ib]) * invDist3;

// --- Calculate forces for the bodies: magnitude times direction
fx_temp += magnitude*dx;
fy_temp += magnitude*dy;
fz_temp += magnitude*dz;
}
__syncthreads();

// --- Stores local memory to global memory
fx_d[tid] = fx_temp;
fy_d[tid] = fy_temp;
fz_d[tid] = fz_temp;
}
}

/*******************************************/
/* KERNEL FUNCTION: TILING + SHARED MEMORY */
/*******************************************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces_Tiling_Shared(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
float invDist, invDist3;

for (unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
tid < N;
tid += blockDim.x*gridDim.x) {

__shared__ float x_sh[BLOCKSIZE], y_sh[BLOCKSIZE], z_sh[BLOCKSIZE], m_sh[BLOCKSIZE];

// --- Initialize register accumulators for forces
float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

// --- Move target particle data to registers
float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

// --- Interact all with all
for(unsigned int ib=0; ib<N; ib+=BLOCKSIZE) {

// --- Loading data to shared memory
x_sh[threadIdx.x] = x_d[ib + threadIdx.x];
y_sh[threadIdx.x] = y_d[ib + threadIdx.x];
z_sh[threadIdx.x] = z_d[ib + threadIdx.x];
m_sh[threadIdx.x] = m_d[ib + threadIdx.x];

__syncthreads();

#pragma unroll
for(unsigned int ic=0; ic<BLOCKSIZE; ic++) {

// --- Compute relative distances
float dx = (x_sh[ic] - x_reg);
float dy = (y_sh[ic] - y_reg);
float dz = (z_sh[ic] - z_reg);
float distanceSquared = dx*dx + dy*dy + dz*dz;

// --- Prevent slingshots and division by zero
distanceSquared += SOFTENING;

float invDist = rsqrtf(distanceSquared);
float invDist3 = invDist * invDist * invDist;

// --- Calculate gravitational magnitude between the bodies
float magnitude = GRAVITATIONAL_CONST * (m_reg * m_sh[ic]) * invDist3;

// --- Calculate forces for the bodies: magnitude times direction
fx_temp += magnitude*dx;
fy_temp += magnitude*dy;
fz_temp += magnitude*dz;
}
__syncthreads();
}

// --- Stores local memory to global memory
fx_d[tid] = fx_temp;
fy_d[tid] = fy_temp;
fz_d[tid] = fz_temp;
}
}

/************************************************/
/* KERNEL FUNCTION: TILING + SHUFFLE OPERATIONS */
/************************************************/
__global__
oid KernelcomputeForces_Tiling_Shuffle(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
float invDist, invDist3;

const int laneid = threadIdx.x & 31;

for (unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
tid < N;
tid += blockDim.x*gridDim.x) {

// --- Initialize register accumulators for forces
float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

// --- Move target particle data to registers
float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

// --- Interact all with all
for(unsigned int ib=0; ib<N; ib+=32) {

// --- Loading data to shared memory
float x_src = x_d[ib + laneid];
float y_src = y_d[ib + laneid];
float z_src = z_d[ib + laneid];
float m_src = m_d[ib + laneid];

#pragma unroll 32
for(unsigned int ic=0; ic<32; ic++) {

// --- Compute relative distances
float dx = (__shfl(x_src, ic) - x_reg);
float dy = (__shfl(y_src, ic) - y_reg);
float dz = (__shfl(z_src, ic) - z_reg);
float distanceSquared = dx*dx + dy*dy + dz*dz;

// --- Prevent slingshots and division by zero
distanceSquared += SOFTENING;

float invDist = rsqrtf(distanceSquared);
float invDist3 = invDist * invDist * invDist;

// --- Calculate gravitational magnitude between the bodies
float magnitude = GRAVITATIONAL_CONST * (m_reg * __shfl(m_src, ic)) * invDist3;

// --- Calculate forces for the bodies: magnitude times direction
fx_temp += magnitude*dx;
fy_temp += magnitude*dy;
fz_temp += magnitude*dz;
}
__syncthreads();
}

// --- Stores local memory to global memory
fx_d[tid] = fx_temp;
fy_d[tid] = fy_temp;
fz_d[tid] = fz_temp;
}
}

/*****************************************/
/* WRAPPER FUNCTION FOR GPU CALCULATIONS */
/*****************************************/
template<int BLOCKSIZE>
float GPUcomputeForces(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N) {
float time;

dim3 grid(iDivUp(N,BLOCKSIZE), 1, 1); // --- Specifys how many blocks in three possible dimensions
dim3 block(BLOCKSIZE, 1, 1); // --- Threads per block

cudaEvent_t start, stop;
gpuErrchk(cudaEventCreate(&start));
gpuErrchk(cudaEventCreate(&stop));

gpuErrchk(cudaEventRecord(start, 0));
KernelcomputeForces_Shared<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
//KernelcomputeForces_Tiling<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
//KernelcomputeForces<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());

gpuErrchk(cudaEventRecord(stop, 0));
gpuErrchk(cudaEventSynchronize(stop));
gpuErrchk(cudaEventElapsedTime(&time, start, stop));

return time;
}

/*****************************************/
/* WRAPPER FUNCTION FOR GPU CALCULATIONS */
/*****************************************/
template<int GRIDSIZE, int BLOCKSIZE>
float GPUcomputeForces_Tiling(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N) {
float time;

dim3 grid(GRIDSIZE, 1, 1); // --- Specifys how many blocks in three possible dimensions
dim3 block(BLOCKSIZE, 1, 1); // --- Threads per block

cudaEvent_t start, stop;
gpuErrchk(cudaEventCreate(&start));
gpuErrchk(cudaEventCreate(&stop));

gpuErrchk(cudaEventRecord(start, 0));
//KernelcomputeForces_Tiling<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
KernelcomputeForces_Tiling_Shuffle<<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());

gpuErrchk(cudaEventRecord(stop, 0));
gpuErrchk(cudaEventSynchronize(stop));
gpuErrchk(cudaEventElapsedTime(&time, start, stop));

return time;
}

/********************/
/* CPU CALCULATIONS */
/********************/
void CPUcomputeForces(float* m_h, float* x_h, float* y_h, float* z_h, float* fx_h, float* fy_h, float* fz_h, unsigned int N) {

for (unsigned int i=0; i<N; i++) {

float invDist, invDist3;

float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

// --- Interact all with all
for(unsigned int ib=0; ib<N; ib++) {

// --- Compute relative distances
float dx = (x_h[ib] - x_h[i]);
float dy = (y_h[ib] - y_h[i]);
float dz = (z_h[ib] - z_h[i]);
float distanceSquared = dx*dx + dy*dy + dz*dz;

// --- Prevent slingshots and division by zero
distanceSquared += SOFTENING;

float invDist = 1.f / sqrtf(distanceSquared);
float invDist3 = invDist * invDist * invDist;

// --- Calculate gravitational magnitude between the bodies
float magnitude = GRAVITATIONAL_CONST * (m_h[i] * m_h[ib]) * invDist3;

// --- Calculate forces for the bodies: magnitude times direction
fx_temp += magnitude*dx;
fy_temp += magnitude*dy;
fz_temp += magnitude*dz;

}

// --- Stores local memory to global memory
fx_h[i] = fx_temp;
fy_h[i] = fy_temp;
fz_h[i] = fz_temp;
}
}

/********/
/* MAIN */
/********/
int main(void)
{
const int N = 16384;

size_t gsize = sizeof(float) * size_t(N);

float * g[10], * _g[7];

for(int i=0; i<7; i++) gpuErrchk( cudaMalloc((void **)&_g[i], gsize));

for(int i=0; i<10; i++) g[i] = (float *)malloc(gsize);

for(int i=0; i<N; i++)
for(int j=0; j<4; j++)
*(g[j]+i) = (float)rand();

for(int i=0; i<4; i++) gpuErrchk(cudaMemcpy(_g[i], g[i], gsize, cudaMemcpyHostToDevice));

// --- Warm up to take context establishment time out.
GPUcomputeForces<512>(_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N);
//GPUcomputeForces_Tiling<32,512>(_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N);

// --- Bench runs
printf("1024: %f\n", GPUcomputeForces<512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("512: %f\n", GPUcomputeForces<512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("256: %f\n", GPUcomputeForces<256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("128: %f\n", GPUcomputeForces<128> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("64: %f\n", GPUcomputeForces<64> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("32: %f\n", GPUcomputeForces<32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));

printf("16, 1024: %f\n", GPUcomputeForces_Tiling<16,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("8, 1024: %f\n", GPUcomputeForces_Tiling<8,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("4, 1024: %f\n", GPUcomputeForces_Tiling<4,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("32, 512: %f\n", GPUcomputeForces_Tiling<32,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("16, 512: %f\n", GPUcomputeForces_Tiling<16,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("8, 512: %f\n", GPUcomputeForces_Tiling<8,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("64, 256: %f\n", GPUcomputeForces_Tiling<64,256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("32, 256: %f\n", GPUcomputeForces_Tiling<32,256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("16, 256: %f\n", GPUcomputeForces_Tiling<16,256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("128,128: %f\n", GPUcomputeForces_Tiling<128,128>(_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("64, 128: %f\n", GPUcomputeForces_Tiling<64,128> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("32, 128: %f\n", GPUcomputeForces_Tiling<32,128> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("256,64: %f\n", GPUcomputeForces_Tiling<256,64> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("128,64: %f\n", GPUcomputeForces_Tiling<128,64> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("64, 64: %f\n", GPUcomputeForces_Tiling<64,64> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("512,32: %f\n", GPUcomputeForces_Tiling<512,32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("256,32: %f\n", GPUcomputeForces_Tiling<512,32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("128,32: %f\n", GPUcomputeForces_Tiling<512,32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));

for(int i=8; i<10; i++) gpuErrchk(cudaMemcpy(g[i], _g[i-3], gsize, cudaMemcpyDeviceToHost));

CPUcomputeForces(g[0],g[1],g[2],g[3],g[4],g[5],g[6],N);

for(int i=0; i<N; i++)
for(int j=8; j<10; j++) {

if (abs(*(g[j]+i) - *(g[j-3]+i)) > 0.001f) {
printf("Error at %i, %i; GPU = %f; CPU = %f\n",i, (j-8), *(g[j-3]+i), *(g[j]+i));
return;
}
}

printf("Test passed!\n");

cudaDeviceReset();

return 0;
}

以下是 N = 16384 的一些结果.
KernelcomputeForces : 最佳 BLOCKSIZE = 128 ; t = 10.07ms .
KernelcomputeForces_Shared : 最佳 BLOCKSIZE = 128 ; t = 7.04ms .
KernelcomputeForces_Tiling : 最佳 BLOCKSIZE = 128 ; t = 11.14ms .
KernelcomputeForces_Tiling_Shared : 最佳 GRIDSIZE = 64 ;最佳 BLOCKSIZE = 256 ; t = 7.20ms .
KernelcomputeForces_Tiling_Shuffle : 最佳 GRIDSIZE = 128 ;最佳 BLOCKSIZE = 128 ; t = 6.84ms .

Warp shuffle 操作似乎非常轻微地提高了对共享内存的性能。

关于cuda - N-Body CUDA 优化,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/5611905/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com