gpt4 book ai didi

c - "more threads"是否意味着更快的速度?

转载 作者:塔克拉玛干 更新时间:2023-11-03 04:24:55 25 4
gpt4 key购买 nike

我在 CUDA 上有一个 Jacobi 实现,但问题是:

我以这种方式分配线程:

// Minimum of a and b; a plain textual macro, so each argument may be evaluated twice.
#define imin(a,b) (a < b ? a : b)
// Launch configuration: number of blocks and threads per block.
int dimBlocks, dimThreads;
dimThreads = 256;
// Blocks = ceil(dim / dimThreads), capped at 32 (dim is declared outside this snippet).
dimBlocks = imin(32, (dimThreads + dim - 1)/dimThreads);

但如果我使用 32 个线程,它反而比使用 256 个或更多线程更快……

我得到了这些结果:

Sequential times:
9900 5.882000
9900 6.071000

Parallel times:
9900 1.341000 //using 32
9900 1.626000 //using 256

其中 9900 是矩阵宽度......我们可以看到以下内容:

5.882 / 1.34 = 4.39
6.07 / 1.62 = 3.74

所以 32 个线程比 256 个线程更高效?

抱歉,我不知道是否应该上传代码(因为它们有点长),如果您要求我会上传。

编辑:

//Based on doubletony algorithm
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "Jacobi.cuh"

#include "thrust\host_vector.h"
#include "thrust\device_vector.h"
#include "thrust\extrema.h"

#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <ctime>

// Minimum of a and b. Arguments and the whole expansion are parenthesized so the
// macro composes correctly inside larger expressions; each argument may still be
// evaluated twice, so do not pass expressions with side effects.
#define imin(a,b) ((a) < (b) ? (a) : (b))

// name OF FUNCTION: __copy_vector
// PURPOSE:
// Kernel that copies the first dim elements of source into dest.
// Each thread starts at its global index and advances by the total
// number of launched threads, so any 1D launch configuration covers
// the whole vector.
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// source double* value vector to be copied
// dest double* reference receives the copy
// dim int value number of elements to copy
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __copy_vector(double *source, double *dest, const int dim)
{
    // Total number of threads in the grid: the distance between two
    // consecutive elements handled by the same thread.
    const unsigned int stride = gridDim.x * blockDim.x;

    for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < dim; idx += stride)
        dest[idx] = source[idx];
}

// name OF FUNCTION: __Jacobi_sum
// PURPOSE:
// Kernel computing, for every row r of the dim x dim row-major matrix A,
// the off-diagonal part of the matrix-vector product:
//     resul[r] = sum over i != r of A[r * dim + i] * B[i]
// The diagonal term A[r,r] * B[r] is deliberately left out.
// Rows are distributed over threads with a grid-stride loop, so any 1D
// launch configuration is valid. No shared memory is used and no
// block-level synchronization is needed: each thread writes only its
// own entries of resul.
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* value row-major dim x dim matrix
// B double* value vector of dim elements
// resul double* reference off-diagonal row sums (dim elements)
// dim int value matrix/vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __Jacobi_sum(const double *A,
                             const double *B,
                             double *resul,
                             const int dim)
{
    const unsigned int stride = gridDim.x * blockDim.x;

    for (int row = blockIdx.x * blockDim.x + threadIdx.x; row < dim; row += stride) {
        // Row offset computed in size_t: row * dim overflows int once
        // dim exceeds 46340.
        const double *rowA = A + (size_t)row * (size_t)dim;

        // Accumulate in a local variable and store once, instead of a
        // read-modify-write on global memory for every column.
        double acc = 0.0;
        for (int col = 0; col < dim; col++) {
            if (col != row)
                acc += rowA[col] * B[col];
        }
        resul[row] = acc;
    }
    // The original ended with the statement `__syncthreads;`, which only
    // names the function without calling it and therefore did nothing;
    // it has been dropped.
}
// name OF FUNCTION: __substract
// PURPOSE:
// Kernel computing the element-wise difference C[i] = A[i] - B[i]
// for i in [0, dim). Elements are distributed over threads with a
// grid-stride loop.
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* value minuend vector
// B double* value subtrahend vector
// C double* reference receives A - B
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __substract(const double *A,
                            const double *B,
                            double *C,
                            const int dim)
{
    // Number of threads in the whole grid.
    const unsigned int stride = gridDim.x * blockDim.x;

    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < dim; idx += stride) {
        const double lhs = A[idx];
        const double rhs = B[idx];
        C[idx] = lhs - rhs;
    }
}
// name OF FUNCTION: __divide
// PURPOSE:
// Kernel performing the division step of a Jacobi iteration: every
// element of B is divided in place by the matching diagonal entry of
// the row-major dim x dim matrix A:
//     B[i] = B[i] / A[i * dim + i]
// The subtraction (B - sum) is NOT done here; the caller must already
// have stored it in B. A zero diagonal entry is not checked, so the
// result follows IEEE-754 division (inf or NaN).
// Elements are distributed over threads with a grid-stride loop.
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* value row-major dim x dim matrix
// B double* reference in: numerators, out: B[i] / A[i,i]
// dim int value matrix/vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __divide(const double *A, double *B, const int dim)
{
    const unsigned int stride = blockDim.x * gridDim.x;
    // Distance between consecutive diagonal entries, in size_t so the
    // index idx * (dim + 1) cannot overflow int for large matrices.
    const size_t diagStep = (size_t)dim + 1;

    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < dim; idx += stride)
        B[idx] /= A[(size_t)idx * diagStep];
}
// name OF FUNCTION: __absolute
// PURPOSE:
// Kernel replacing, in place, every strictly negative element of A
// with its negation; elements that do not compare less than zero are
// left untouched. Elements are distributed over threads with a
// grid-stride loop.
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* reference in: values, out: |A[i]|
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __absolute(double *A, const int dim)
{
    // Number of threads in the whole grid.
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < dim; idx += stride) {
        const double value = A[idx];
        // Write back only when the sign actually has to change.
        if (value < 0)
            A[idx] = -value;
    }
}
// name OF FUNCTION: Jacobi_Cuda
// PURPOSE:
// Solves the linear system A * X = B with Jacobi's method on the GPU.
// Each iteration runs these kernels in order:
//     Y     = X                      (__copy_vector)
//     Resul = offdiag(A) * Y         (__Jacobi_sum)
//     X     = B - Resul              (__substract)
//     X     = X / diag(A)            (__divide)
//     Resul = |Y - X|                (__substract, __absolute)
// The largest element of Resul is the error; iteration stops when it is
// no longer greater than e or when maxIter iterations have run.
//
// Any failing allocation, copy or kernel launch is reported on stderr;
// in that case Vector_X is left unmodified and every resource is freed.
// For dim <= 0 nothing is computed and *t is set to 0.
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// Matrix_A double* value Matrix A (coefficients), row-major dim x dim
// Vector_B double* value Vector B
// Vector_X double* reference in: initial guess, out: solution
// dim int value Matrix dimension
// e double value Error allowed
// maxIter int value Maximum iterations allowed
// t double* reference seconds spent in the iteration loop,
//                     measured with clock(); may be NULL
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//

// Reports a failed CUDA runtime call on stderr; returns true on cudaSuccess.
static bool jacobiCudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "Jacobi_Cuda: %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

void Jacobi_Cuda(const double *Matrix_A,
                 const double *Vector_B,
                 double *Vector_X,
                 const int dim,
                 const double e,
                 const int maxIter,
                 double *t)
{
    if (t != NULL)
        *t = 0.0;

    // An empty system would mean a zero-block launch and a max over an
    // empty range; there is nothing to solve.
    if (dim <= 0)
        return;

    /** Host variables **/
    int iter = 0;       // iteration counter
    double err = 1;     // error between X^k and X^k-1
    const size_t vecBytes = sizeof(double) * (size_t)dim;
    const size_t matBytes = vecBytes * (size_t)dim;

    // Launch configuration: 64 threads per block, at most 32 blocks; the
    // kernels loop with a grid stride, so this covers any dim.
    const int dimThreads = 64;
    const int dimBlocks = imin(32, (dim + dimThreads - 1) / dimThreads);

    // Host staging buffer for |X^k - X^k-1|.
    double *tmp = (double *) malloc(vecBytes);
    bool ok = (tmp != NULL);
    if (!ok)
        fprintf(stderr, "Jacobi_Cuda: host allocation of %lu bytes failed\n",
                (unsigned long) vecBytes);
    /** ************** **/

    /** Device variables **/
    // Initialised to NULL so the cleanup below is valid on every path
    // (cudaFree(NULL) is a no-op).
    double *d_Matrix_A = NULL, *d_Vector_B = NULL, *d_Vector_X = NULL;
    double *d_Vector_Y = NULL, *d_Vector_Resul = NULL;

    ok = ok
        && jacobiCudaOk(cudaMalloc((void**)&d_Matrix_A, matBytes), "cudaMalloc(Matrix_A)")
        && jacobiCudaOk(cudaMalloc((void**)&d_Vector_B, vecBytes), "cudaMalloc(Vector_B)")
        && jacobiCudaOk(cudaMalloc((void**)&d_Vector_X, vecBytes), "cudaMalloc(Vector_X)")
        && jacobiCudaOk(cudaMalloc((void**)&d_Vector_Y, vecBytes), "cudaMalloc(Vector_Y)")
        && jacobiCudaOk(cudaMalloc((void**)&d_Vector_Resul, vecBytes), "cudaMalloc(Vector_Resul)");
    /** **************** **/

    /** Initialize **/
    ok = ok
        && jacobiCudaOk(cudaMemcpy(d_Matrix_A, Matrix_A, matBytes, cudaMemcpyHostToDevice),
                        "cudaMemcpy(Matrix_A)")
        && jacobiCudaOk(cudaMemcpy(d_Vector_B, Vector_B, vecBytes, cudaMemcpyHostToDevice),
                        "cudaMemcpy(Vector_B)")
        && jacobiCudaOk(cudaMemcpy(d_Vector_X, Vector_X, vecBytes, cudaMemcpyHostToDevice),
                        "cudaMemcpy(Vector_X)");
    /** ********** **/

    if (ok) {
        const clock_t start = clock();

        /** Jacobi **/
        while (err > e && iter < maxIter) {
            // Y keeps the previous iterate X^k-1.
            __copy_vector<<<dimBlocks, dimThreads>>>(d_Vector_X, d_Vector_Y, dim);

            // X^k = (B - offdiag(A) * Y) / diag(A)
            __Jacobi_sum<<<dimBlocks, dimThreads>>>(d_Matrix_A, d_Vector_Y,
                                                    d_Vector_Resul, dim);
            __substract<<<dimBlocks, dimThreads>>>(d_Vector_B, d_Vector_Resul,
                                                   d_Vector_X, dim);
            __divide<<<dimBlocks, dimThreads>>>(d_Matrix_A, d_Vector_X, dim);

            // Resul = |X^k-1 - X^k|
            __substract<<<dimBlocks, dimThreads>>>(d_Vector_Y, d_Vector_X,
                                                   d_Vector_Resul, dim);
            __absolute<<<dimBlocks, dimThreads>>>(d_Vector_Resul, dim);

            // Launch errors show up in cudaGetLastError(); the blocking
            // copy then waits for the kernels and reports execution errors.
            ok = jacobiCudaOk(cudaGetLastError(), "kernel launch")
                && jacobiCudaOk(cudaMemcpy(tmp, d_Vector_Resul, vecBytes,
                                           cudaMemcpyDeviceToHost),
                                "cudaMemcpy(residual)");
            if (!ok)
                break;

            // Largest absolute change between iterates, computed on the host.
            const double *maxDiff = thrust::max_element(tmp, tmp + dim);
            err = *maxDiff;

            iter++;
        }

        const clock_t finish = clock();
        if (t != NULL)
            *t = (double)(finish - start) / CLOCKS_PER_SEC;
    }

    if (ok) {
        ok = jacobiCudaOk(cudaMemcpy(Vector_X, d_Vector_X, vecBytes,
                                     cudaMemcpyDeviceToHost),
                          "cudaMemcpy(solution)");
    }
    if (iter == maxIter)
        puts("Jacobi has reached maxIter!");
    /** ****** **/

    /** Free memory **/
    cudaFree(d_Matrix_A);
    cudaFree(d_Vector_B);
    cudaFree(d_Vector_X);
    cudaFree(d_Vector_Y);
    cudaFree(d_Vector_Resul);
    free(tmp);
    /** *********** **/
}

最佳答案

这取决于你的算法。有些算法从定义上就无法并行化(例如计算斐波那契数列)。不过这里有一个可并行化的 Jacobi 算法(由 Brown 提供)。请注意,方程组既可以串行求解,也可以并行求解,区别只在于代码怎么写。

简而言之,除非您向我们展示(或至少解释)该算法,否则无法判断更多线程是否意味着更快。就线程同步而言,CUDA 非常(非常)擅长摊平同步开销,因此(只要您的算法得当)更多线程几乎总能带来更快的速度。

关于c - "more threads"是否意味着更快的速度?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/12013593/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com