gpt4 book ai didi

performance - 如何以最大性能标准化 CUDA 中的矩阵列?

转载 作者:行者123 更新时间:2023-12-02 03:32:52 31 4
gpt4 key购买 nike

如何在CUDA中有效标准化矩阵列?

我的矩阵以列主存储,典型大小为2000x200。

该操作可以用以下 matlab 代码表示。

A = rand(2000,200);

A = exp(A);
A = A./repmat(sum(A,1), [size(A,1) 1]);

这可以通过 Thrust、cuBLAS 和/或 cuNPP 有效完成吗?

包含4个内核的快速实现如下所示。

想知道这些是否可以在 1 个或 2 个内核中完成以提高性能,特别是对于 cublasDgemv() 实现的列求和步骤。

#include <cuda.h>
#include <curand.h>
#include <cublas_v2.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include <thrust/iterator/constant_iterator.h>
#include <math.h>

// In-place functor: overwrites the referenced element with its natural
// exponential. Callable from both host and device code.
struct Exp
{
    __host__ __device__ void operator()(double& value)
    {
        const double exponentiated = exp(value);
        value = exponentiated;
    }
};

// In-place functor: overwrites the referenced element with its reciprocal.
// Callable from both host and device code.
struct Inv
{
    __host__ __device__ void operator()(double& value)
    {
        const double reciprocal = 1.0 / value;
        value = reciprocal;
    }
};

// Benchmark driver for column normalization of exp(A).
//
// A is an m x n matrix handed to cuBLAS with leading dimension m. Each of the
// 100 iterations refills A with uniform random numbers and then runs:
//   1. A      <- exp(A)            (thrust::for_each with Exp)
//   2. sum    <- A^T * ones        (cublasDgemv, one sum per column)
//   3. sum    <- 1 / sum           (thrust::for_each with Inv)
//   4. A      <- A * diag(sum)     (cublasDdgmm, scales each column)
//
// Returns 0 on success and 1 if any cuBLAS/cuRAND/CUDA runtime call reports
// a failure.
int main()
{
    cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);

    cublasHandle_t hd;
    curandGenerator_t rng;
    if (cublasCreate(&hd) != CUBLAS_STATUS_SUCCESS)
    {
        return 1;
    }
    if (curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS)
    {
        cublasDestroy(hd);
        return 1;
    }

    const size_t m = 2000, n = 200;
    // cuBLAS takes its dimensions as int; convert once, explicitly.
    const int rows = static_cast<int>(m);
    const int cols = static_cast<int>(n);
    const double c1 = 1.0;
    const double c0 = 0.0;

    thrust::device_vector<double> A(m * n);
    thrust::device_vector<double> sum(1 * n);
    // With CUBLAS_OP_T the gemv input vector has m entries, so the vector of
    // ones only needs m elements (not m * n).
    thrust::device_vector<double> one(m, 1.0);

    double* pA = thrust::raw_pointer_cast(&A[0]);
    double* pSum = thrust::raw_pointer_cast(&sum[0]);
    double* pOne = thrust::raw_pointer_cast(&one[0]);

    bool ok = true;
    for (int i = 0; ok && i < 100; i++)
    {
        ok = curandGenerateUniformDouble(rng, pA, A.size()) == CURAND_STATUS_SUCCESS;
        if (!ok)
        {
            break;
        }

        // Element-wise exponential, in place.
        thrust::for_each(A.begin(), A.end(), Exp());

        // Column sums: sum = 1.0 * A^T * one + 0.0 * sum.
        ok = cublasDgemv(hd, CUBLAS_OP_T, rows, cols,
                         &c1, pA, rows, pOne, 1, &c0, pSum, 1) == CUBLAS_STATUS_SUCCESS;
        if (!ok)
        {
            break;
        }

        // Turn the sums into scale factors.
        thrust::for_each(sum.begin(), sum.end(), Inv());

        // Right-multiply by diag(sum): column j of A is scaled by sum[j].
        ok = cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, rows, cols,
                         pA, rows, pSum, 1, pA, rows) == CUBLAS_STATUS_SUCCESS;
    }

    // Library calls are asynchronous; synchronize so that execution errors
    // are reflected in the exit code.
    if (cudaDeviceSynchronize() != cudaSuccess)
    {
        ok = false;
    }

    curandDestroyGenerator(rng);
    cublasDestroy(hd);

    return ok ? 0 : 1;
}

最佳答案

我比较了 M2090 和 CUDA 5.0 上 3 种方法的性能。

  1. [173.179 us] cublas 实现如问题所示
  2. [733.734 us] 使用 @talonmies 的 thrust::reduce_by_key 实现纯 Thrust
  3. [1.508 ms] 使用 thrust::inclusive_scan_by_key 实现纯 Thrust

Performance on A_{2,000 x 200}

可以看出,

  1. cublas 在这种情况下具有最高的性能;
  2. thrust::reduce_by_key 和 thrust::inclusive_scan_by_key 都会启动多个内核,这会导致额外的开销;
  3. 与 thrust::reduce_by_key 相比,thrust::inclusive_scan_by_key 向 DRAM 写入了更多数据,这可能是内核时间较长的原因之一;
  4. cublas 与 Thrust 方法之间的主要性能差异在于矩阵列求和。Thrust 较慢可能是因为 thrust::reduce_by_key 旨在对不同长度的段进行归约,而 cublasDgemv() 只能应用于固定长度的段(行/列)。

当矩阵 A 足够大、内核启动开销可以忽略时,cublas 方法仍然表现最佳。A_{20,000 x 2,000} 上的分析结果如下所示。

Performance on A_{20,000 x 2,000}

将第一个 for_each 操作与 @talonmies 所示的 cublasSgemv 调用融合可能会进一步提高性能,但我认为应该使用手工编写的内核而不是 thrust::reduce_by_key

这3种方法的代码如下所示。

#include <cuda.h>
#include <curand.h>
#include <cublas_v2.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <math.h>

// Unary functor returning the natural exponential of its argument.
// Deriving from thrust::unary_function supplies the argument/result typedefs
// for use with thrust::transform and thrust::make_transform_iterator.
struct Exp: public thrust::unary_function<double, double>
{
    __host__ __device__ double operator()(double value)
    {
        const double exponentiated = exp(value);
        return exponentiated;
    }
};

// Unary functor returning the reciprocal of its argument.
// Deriving from thrust::unary_function supplies the argument/result typedefs.
struct Inv: public thrust::unary_function<double, double>
{
    __host__ __device__ double operator()(double value)
    {
        const double reciprocal = 1.0 / value;
        return reciprocal;
    }
};

// Unary functor that multiplies its argument by a constant fixed at
// construction time (stored in the public member C).
template<typename T>
struct MulC: public thrust::unary_function<T, T>
{
    T C;

    __host__ __device__ MulC(T factor)
        : C(factor)
    {
    }

    __host__ __device__ T operator()(T value)
    {
        return value * this->C;
    }
};

// Unary functor that divides its argument by a constant fixed at
// construction time (stored in the public member C). For integral T this is
// integer division.
template<typename T>
struct line2col: public thrust::unary_function<T, T>
{
    T C;

    __host__ __device__ line2col(T divisor)
        : C(divisor)
    {
    }

    __host__ __device__ T operator()(T index)
    {
        return index / this->C;
    }
};

// Benchmark driver comparing three ways of computing
//     out = exp(A) ./ repmat(sum(exp(A), 1), [m 1])
// for an m x n matrix A handed to cuBLAS with leading dimension m:
//   1. Thrust element-wise kernels + cublasDgemv + cublasDdgmm (result in B)
//   2. thrust::reduce_by_key for the column sums               (result in C)
//   3. thrust::inclusive_scan_by_key for the column sums       (result in C)
// Each variant is executed `count` times on the same random input.
//
// Returns 0 on success and 1 if any cuBLAS/cuRAND/CUDA runtime call reports
// a failure.
int main()
{
    cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);

    cublasHandle_t hd;
    curandGenerator_t rng;
    if (cublasCreate(&hd) != CUBLAS_STATUS_SUCCESS)
    {
        return 1;
    }
    if (curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS)
    {
        cublasDestroy(hd);
        return 1;
    }

    const size_t m = 2000, n = 200;
    // cuBLAS and the int-based index functors take int; convert once.
    const int rows = static_cast<int>(m);
    const int cols = static_cast<int>(n);
    const double c1 = 1.0;
    const double c0 = 0.0;

    thrust::device_vector<double> A(m * n);
    thrust::device_vector<double> B(m * n);
    thrust::device_vector<double> C(m * n);
    thrust::device_vector<double> sum1(1 * n);
    thrust::device_vector<double> sum2(1 * n);
    // With CUBLAS_OP_T the gemv input vector has m entries, so the vector of
    // ones only needs m elements (not m * n).
    thrust::device_vector<double> one(m, 1);

    double* pA = thrust::raw_pointer_cast(&A[0]);
    double* pB = thrust::raw_pointer_cast(&B[0]);
    double* pSum1 = thrust::raw_pointer_cast(&sum1[0]);
    double* pOne = thrust::raw_pointer_cast(&one[0]);

    bool ok = curandGenerateUniformDouble(rng, pA, A.size()) == CURAND_STATUS_SUCCESS;

    const int count = 2;

    // Variant 1: Thrust for the element-wise steps, cuBLAS for sum and scale.
    for (int i = 0; ok && i < count; i++)
    {
        thrust::transform(A.begin(), A.end(), B.begin(), Exp());

        // Column sums: sum1 = 1.0 * B^T * one + 0.0 * sum1.
        ok = cublasDgemv(hd, CUBLAS_OP_T, rows, cols,
                         &c1, pB, rows, pOne, 1, &c0, pSum1, 1) == CUBLAS_STATUS_SUCCESS;
        if (!ok)
        {
            break;
        }

        thrust::transform(sum1.begin(), sum1.end(), sum1.begin(), Inv());

        // Scale column j of B by sum1[j]. The scale factors live in sum1;
        // sum2 has not been written at this point and must not be used here.
        ok = cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, rows, cols,
                         pB, rows, pSum1, 1, pB, rows) == CUBLAS_STATUS_SUCCESS;
    }

    // Variant 2: segmented reduction keyed by column index (linear index / m).
    for (int i = 0; ok && i < count; i++)
    {
        thrust::reduce_by_key(
                thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(rows)),
                thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(rows)) + A.size(),
                thrust::make_transform_iterator(A.begin(), Exp()),
                thrust::make_discard_iterator(),
                sum2.begin());

        // Divide exp(A) (not raw A) by the sum of its column; the permutation
        // iterator looks up sum2[column of element].
        thrust::transform(
                thrust::make_transform_iterator(A.begin(), Exp()),
                thrust::make_transform_iterator(A.end(), Exp()),
                thrust::make_permutation_iterator(
                        sum2.begin(),
                        thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(rows))),
                C.begin(),
                thrust::divides<double>());
    }

    // Variant 3: segmented inclusive scan; the last entry of each column's
    // scan (offset m - 1 + j * m) is that column's sum.
    for (int i = 0; ok && i < count; i++)
    {
        thrust::inclusive_scan_by_key(
                thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(rows)),
                thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(rows)) + A.size(),
                thrust::make_transform_iterator(A.begin(), Exp()),
                C.begin());

        thrust::copy(
                thrust::make_permutation_iterator(
                        C.begin() + m - 1,
                        thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(rows))),
                thrust::make_permutation_iterator(
                        C.begin() + m - 1,
                        thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(rows))) + n,
                sum2.begin());

        // Divide exp(A) (not raw A) by the sum of its column.
        thrust::transform(
                thrust::make_transform_iterator(A.begin(), Exp()),
                thrust::make_transform_iterator(A.end(), Exp()),
                thrust::make_permutation_iterator(
                        sum2.begin(),
                        thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(rows))),
                C.begin(),
                thrust::divides<double>());
    }

    // Library calls are asynchronous; synchronize so that execution errors
    // are reflected in the exit code.
    if (cudaDeviceSynchronize() != cudaSuccess)
    {
        ok = false;
    }

    curandDestroyGenerator(rng);
    cublasDestroy(hd);

    return ok ? 0 : 1;
}

关于performance - 如何以最大性能标准化 CUDA 中的矩阵列?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/14211093/

31 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com